wangkun 1 gadu atpakaļ
vecāks
revīzija
ee46c592fe
2 mainīti faili ar 116 papildinājumiem un 113 dzēšanām
  1. 18 18
      main/process.sh
  2. 98 95
      xigua/xigua_recommend/xigua_recommend_scheduling.py

+ 18 - 18
main/process.sh

@@ -189,24 +189,24 @@ fi
 #  echo "$(date "+%Y-%m-%d %H:%M:%S") 西瓜推荐榜爬虫策略 进程状态正常" >> ${log_path}
 #fi
 
-# 西瓜搜索爬虫策略
-if [[ "$time" > "00:00:00" ]] && [[ "$time" < "00:10:00" ]]; then
-  echo "$(date "+%Y-%m-%d %H:%M:%S") 正在监测 西瓜搜索爬虫策略 进程状态" >> ${log_path}
-  ps -ef | grep "run_xigua_search_new" | grep -v "grep"
-  if [ "$?" -eq 1 ];then
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 西瓜搜索爬虫策略, 异常停止, 正在重启!" >> ${log_path}
-    if [ ${env} = "dev" ];then
-      cd ${piaoquan_crawler_dir} && sh main/scheduling_main.sh ./xigua/xigua_main/run_xigua_search_new.py --log_type="search" --crawler="xigua" --env="dev" xigua/logs/nohup-search.log
-    else
-      cd ${piaoquan_crawler_dir} && /usr/bin/sh main/scheduling_main.sh ./xigua/xigua_main/run_xigua_search_new.py --log_type="search" --crawler="xigua" --env="prod" xigua/logs/nohup-search.log
-    fi
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 重启完成!" >> ${log_path}
-  else
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 西瓜搜索爬虫策略 进程状态正常" >> ${log_path}
-  fi
-else
-  echo "$(date "+%Y-%m-%d %H:%M:%S") 不在任务启动时间范围: 西瓜搜索爬虫" >> ${log_path}
-fi
+## 西瓜搜索爬虫策略
+#if [[ "$time" > "00:00:00" ]] && [[ "$time" < "00:10:00" ]]; then
+#  echo "$(date "+%Y-%m-%d %H:%M:%S") 正在监测 西瓜搜索爬虫策略 进程状态" >> ${log_path}
+#  ps -ef | grep "run_xigua_search_new" | grep -v "grep"
+#  if [ "$?" -eq 1 ];then
+#    echo "$(date "+%Y-%m-%d %H:%M:%S") 西瓜搜索爬虫策略, 异常停止, 正在重启!" >> ${log_path}
+#    if [ ${env} = "dev" ];then
+#      cd ${piaoquan_crawler_dir} && sh main/scheduling_main.sh ./xigua/xigua_main/run_xigua_search_new.py --log_type="search" --crawler="xigua" --env="dev" xigua/logs/nohup-search.log
+#    else
+#      cd ${piaoquan_crawler_dir} && /usr/bin/sh main/scheduling_main.sh ./xigua/xigua_main/run_xigua_search_new.py --log_type="search" --crawler="xigua" --env="prod" xigua/logs/nohup-search.log
+#    fi
+#    echo "$(date "+%Y-%m-%d %H:%M:%S") 重启完成!" >> ${log_path}
+#  else
+#    echo "$(date "+%Y-%m-%d %H:%M:%S") 西瓜搜索爬虫策略 进程状态正常" >> ${log_path}
+#  fi
+#else
+#  echo "$(date "+%Y-%m-%d %H:%M:%S") 不在任务启动时间范围: 西瓜搜索爬虫" >> ${log_path}
+#fi
 
 # youtube定向爬虫策略
 echo "$(date "+%Y-%m-%d %H:%M:%S") 正在监测 youtube定向爬虫策略 进程状态" >> ${log_path}

+ 98 - 95
xigua/xigua_recommend/xigua_recommend_scheduling.py

@@ -605,101 +605,104 @@ class XiguarecommendScheduling:
         queryCount = 1
         while True:
             Common.logger(log_type, crawler).info(f"正在抓取第{queryCount}页视频")
-            signature = cls.get_signature(env)
-            if signature is None:
-                Common.logger(log_type, crawler).warning(f"signature:{signature}")
-                time.sleep(1)
-                continue
-            url = "https://www.ixigua.com/api/feedv2/feedById?"
-            params = {
-                "channelId": "94349543909",
-                "count": "9",
-                "maxTime": str(int(time.time())),
-                # "maxTime": "1683190690",
-                "queryCount": str(queryCount),
-                "_signature": signature,
-                "request_from": "701",
-                "offset": "0",
-                "referrer:": "https://open.weixin.qq.com/",
-                "aid": "1768",
-                "msToken": "XDpSA6_ZPP-gAkkBV-_WRQvNpG20uUUGPwf3E-S-txhznjBcXNbK2sbOuSpF3U7Jki6R9HwLDPeW4Gj7n6PURPTKrKLEs8J-ieFrwXDvMp2DX94ZoMua",
-                # "X-Bogus": "DFSzswVOx7bANt0TtCAcOFm4pIkR",
-            }
-            headers = {
-                'referer': 'https://www.ixigua.com/',
-                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
-                'authority': 'www.ixigua.com',
-                'accept': 'application/json, text/plain, */*',
-                'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
-                'cache-control': 'no-cache',
-                # 'cookie': 'ttcid=5d8f917a525e46759dc886296bf1111b69; MONITOR_WEB_ID=ad1c8360-d4c9-4fa2-a801-d9fd68dfc1b2; s_v_web_id=verify_lh8vaa6v_VI4RQ0ET_nVbq_4PXw_8mfN_7Xp6wdLOZi08; passport_csrf_token=0e7c6992cb6170c9db034c3696191fff; passport_csrf_token_default=0e7c6992cb6170c9db034c3696191fff; odin_tt=b102690fef38bf07c400e3c69cdc27627701802bdd816fa827e3721c33607c4d2c0cbef09fe99c7d370e4a9e9e11c263; sid_guard=8dec4ecbe52cbdcff99dafe622b586b4%7C1683189144%7C3024002%7CThu%2C+08-Jun-2023+08%3A32%3A26+GMT; uid_tt=1dccbeaf685e24afd018fec335f3151d; uid_tt_ss=1dccbeaf685e24afd018fec335f3151d; sid_tt=8dec4ecbe52cbdcff99dafe622b586b4; sessionid=8dec4ecbe52cbdcff99dafe622b586b4; sessionid_ss=8dec4ecbe52cbdcff99dafe622b586b4; sid_ucp_v1=1.0.0-KGVhZTIxYjFlNzRlZTNhZjk5MjNlNzk2NGRhOWJlYzZiNGI5NzBhMzYKFQiu3d-eqQIQmNvNogYYGCAMOAhACxoCaGwiIDhkZWM0ZWNiZTUyY2JkY2ZmOTlkYWZlNjIyYjU4NmI0; ssid_ucp_v1=1.0.0-KGVhZTIxYjFlNzRlZTNhZjk5MjNlNzk2NGRhOWJlYzZiNGI5NzBhMzYKFQiu3d-eqQIQmNvNogYYGCAMOAhACxoCaGwiIDhkZWM0ZWNiZTUyY2JkY2ZmOTlkYWZlNjIyYjU4NmI0; support_webp=true; support_avif=true; csrf_session_id=9dd5d8287d4f075ae24ff163cd22e51f; msToken=XDpSA6_ZPP-gAkkBV-_WRQvNpG20uUUGPwf3E-S-txhznjBcXNbK2sbOuSpF3U7Jki6R9HwLDPeW4Gj7n6PURPTKrKLEs8J-ieFrwXDvMp2DX94ZoMua; ixigua-a-s=1; tt_scid=UTduWO4ij7cX6YKx23sDuV4zjvFkGFtFk5ZBhEnd1lJ1EZBykStzU7tbWQOSzGdE0fc6; ttwid=1%7C4zaTJmlaHpEa8rAB-KjREdxT3sNBUJWrAzRJnNvqExQ%7C1683198318%7Cffc2eef612caab19a0db93b4cec27e21a6230f9b82ab4bf5b1c6193d082baab1',
-                'pragma': 'no-cache',
-                'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
-                'sec-ch-ua-mobile': '?0',
-                'sec-ch-ua-platform': '"macOS"',
-                'sec-fetch-dest': 'empty',
-                'sec-fetch-mode': 'cors',
-                'sec-fetch-site': 'same-origin',
-                # 'tt-anti-token': '95Ny0vj4Q-90dd9b91193b34ce554cc2861439b9629d897723f4d33719b9747d7d18a2ff7c',
-                # 'x-secsdk-csrf-token': '000100000001ecb8f07e247a89e289b3ab55f3c967a8e88f88aa0addb1ddca9d3e36f35d7999175be79b8699c881'
-            }
-            urllib3.disable_warnings()
-            s = requests.session()
-            # max_retries=3 重试3次
-            s.mount('http://', HTTPAdapter(max_retries=3))
-            s.mount('https://', HTTPAdapter(max_retries=3))
-            response = requests.get(url=url, headers=headers, params=params, proxies=Common.tunnel_proxies(), verify=False, timeout=5)
-            response.close()
-            queryCount += 1
-            if response.status_code != 200:
-                Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
-                return
-            elif 'data' not in response.text:
-                Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
-                return
-            elif 'channelFeed' not in response.json()['data']:
-                Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
-                return
-            elif 'Data' not in response.json()['data']['channelFeed']:
-                Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
-                return
-            elif len(response.json()['data']['channelFeed']['Data']) == 0:
-                Common.logger(log_type, crawler).warning(f"没有更多数据啦 ~ :{response.json()}\n")
-                return
-            else:
-                feeds = response.json()['data']['channelFeed']['Data']
-                for i in range(len(feeds)):
-                    try:
-                        item_id = feeds[i].get("data", {}).get("item_id", "")
-                        if item_id == "":
-                            Common.logger(log_type, crawler).info("无效视频\n")
-                            continue
-                        video_dict = cls.get_video_info(log_type, crawler, item_id)
-                        if video_dict is None:
-                            Common.logger(log_type, crawler).info("无效视频\n")
-                            continue
-                        for k, v in video_dict.items():
-                            Common.logger(log_type, crawler).info(f"{k}:{v}")
-                        if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
-                            Common.logger(log_type, crawler).info("不满足抓取规则\n")
-                        elif any(str(word) if str(word) in video_dict["video_title"] else False
-                                 for word in get_config_from_mysql(log_type=log_type,
-                                                                   source=crawler,
-                                                                   env=env,
-                                                                   text="filter",
-                                                                   action="")) is True:
-                            Common.logger(log_type, crawler).info('已中过滤词\n')
-                        elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
-                            Common.logger(log_type, crawler).info('视频已下载\n')
-                        else:
-                            cls.download_publish(log_type=log_type,
-                                                 crawler=crawler,
-                                                 our_uid=our_uid,
-                                                 video_dict=video_dict,
-                                                 rule_dict=rule_dict,
-                                                 env=env)
-                    except Exception as e:
-                        Common.logger(log_type, crawler).error(f"抓取单条视频时异常:{e}\n")
+            try:
+                signature = cls.get_signature(env)
+                if signature is None:
+                    Common.logger(log_type, crawler).warning(f"signature:{signature}")
+                    time.sleep(1)
+                    continue
+                url = "https://www.ixigua.com/api/feedv2/feedById?"
+                params = {
+                    "channelId": "94349543909",
+                    "count": "9",
+                    "maxTime": str(int(time.time())),
+                    # "maxTime": "1683190690",
+                    "queryCount": str(queryCount),
+                    "_signature": signature,
+                    "request_from": "701",
+                    "offset": "0",
+                    "referrer:": "https://open.weixin.qq.com/",
+                    "aid": "1768",
+                    "msToken": "XDpSA6_ZPP-gAkkBV-_WRQvNpG20uUUGPwf3E-S-txhznjBcXNbK2sbOuSpF3U7Jki6R9HwLDPeW4Gj7n6PURPTKrKLEs8J-ieFrwXDvMp2DX94ZoMua",
+                    # "X-Bogus": "DFSzswVOx7bANt0TtCAcOFm4pIkR",
+                }
+                headers = {
+                    'referer': 'https://www.ixigua.com/',
+                    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
+                    'authority': 'www.ixigua.com',
+                    'accept': 'application/json, text/plain, */*',
+                    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+                    'cache-control': 'no-cache',
+                    # 'cookie': 'ttcid=5d8f917a525e46759dc886296bf1111b69; MONITOR_WEB_ID=ad1c8360-d4c9-4fa2-a801-d9fd68dfc1b2; s_v_web_id=verify_lh8vaa6v_VI4RQ0ET_nVbq_4PXw_8mfN_7Xp6wdLOZi08; passport_csrf_token=0e7c6992cb6170c9db034c3696191fff; passport_csrf_token_default=0e7c6992cb6170c9db034c3696191fff; odin_tt=b102690fef38bf07c400e3c69cdc27627701802bdd816fa827e3721c33607c4d2c0cbef09fe99c7d370e4a9e9e11c263; sid_guard=8dec4ecbe52cbdcff99dafe622b586b4%7C1683189144%7C3024002%7CThu%2C+08-Jun-2023+08%3A32%3A26+GMT; uid_tt=1dccbeaf685e24afd018fec335f3151d; uid_tt_ss=1dccbeaf685e24afd018fec335f3151d; sid_tt=8dec4ecbe52cbdcff99dafe622b586b4; sessionid=8dec4ecbe52cbdcff99dafe622b586b4; sessionid_ss=8dec4ecbe52cbdcff99dafe622b586b4; sid_ucp_v1=1.0.0-KGVhZTIxYjFlNzRlZTNhZjk5MjNlNzk2NGRhOWJlYzZiNGI5NzBhMzYKFQiu3d-eqQIQmNvNogYYGCAMOAhACxoCaGwiIDhkZWM0ZWNiZTUyY2JkY2ZmOTlkYWZlNjIyYjU4NmI0; ssid_ucp_v1=1.0.0-KGVhZTIxYjFlNzRlZTNhZjk5MjNlNzk2NGRhOWJlYzZiNGI5NzBhMzYKFQiu3d-eqQIQmNvNogYYGCAMOAhACxoCaGwiIDhkZWM0ZWNiZTUyY2JkY2ZmOTlkYWZlNjIyYjU4NmI0; support_webp=true; support_avif=true; csrf_session_id=9dd5d8287d4f075ae24ff163cd22e51f; msToken=XDpSA6_ZPP-gAkkBV-_WRQvNpG20uUUGPwf3E-S-txhznjBcXNbK2sbOuSpF3U7Jki6R9HwLDPeW4Gj7n6PURPTKrKLEs8J-ieFrwXDvMp2DX94ZoMua; ixigua-a-s=1; tt_scid=UTduWO4ij7cX6YKx23sDuV4zjvFkGFtFk5ZBhEnd1lJ1EZBykStzU7tbWQOSzGdE0fc6; ttwid=1%7C4zaTJmlaHpEa8rAB-KjREdxT3sNBUJWrAzRJnNvqExQ%7C1683198318%7Cffc2eef612caab19a0db93b4cec27e21a6230f9b82ab4bf5b1c6193d082baab1',
+                    'pragma': 'no-cache',
+                    'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
+                    'sec-ch-ua-mobile': '?0',
+                    'sec-ch-ua-platform': '"macOS"',
+                    'sec-fetch-dest': 'empty',
+                    'sec-fetch-mode': 'cors',
+                    'sec-fetch-site': 'same-origin',
+                    # 'tt-anti-token': '95Ny0vj4Q-90dd9b91193b34ce554cc2861439b9629d897723f4d33719b9747d7d18a2ff7c',
+                    # 'x-secsdk-csrf-token': '000100000001ecb8f07e247a89e289b3ab55f3c967a8e88f88aa0addb1ddca9d3e36f35d7999175be79b8699c881'
+                }
+                urllib3.disable_warnings()
+                s = requests.session()
+                # max_retries=3 重试3次
+                s.mount('http://', HTTPAdapter(max_retries=3))
+                s.mount('https://', HTTPAdapter(max_retries=3))
+                response = requests.get(url=url, headers=headers, params=params, proxies=Common.tunnel_proxies(), verify=False, timeout=5)
+                response.close()
+                queryCount += 1
+                if response.status_code != 200:
+                    Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
+                    return
+                elif 'data' not in response.text:
+                    Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
+                    return
+                elif 'channelFeed' not in response.json()['data']:
+                    Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
+                    return
+                elif 'Data' not in response.json()['data']['channelFeed']:
+                    Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
+                    return
+                elif len(response.json()['data']['channelFeed']['Data']) == 0:
+                    Common.logger(log_type, crawler).warning(f"没有更多数据啦 ~ :{response.json()}\n")
+                    return
+                else:
+                    feeds = response.json()['data']['channelFeed']['Data']
+                    for i in range(len(feeds)):
+                        try:
+                            item_id = feeds[i].get("data", {}).get("item_id", "")
+                            if item_id == "":
+                                Common.logger(log_type, crawler).info("无效视频\n")
+                                continue
+                            video_dict = cls.get_video_info(log_type, crawler, item_id)
+                            if video_dict is None:
+                                Common.logger(log_type, crawler).info("无效视频\n")
+                                continue
+                            for k, v in video_dict.items():
+                                Common.logger(log_type, crawler).info(f"{k}:{v}")
+                            if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
+                                Common.logger(log_type, crawler).info("不满足抓取规则\n")
+                            elif any(str(word) if str(word) in video_dict["video_title"] else False
+                                     for word in get_config_from_mysql(log_type=log_type,
+                                                                       source=crawler,
+                                                                       env=env,
+                                                                       text="filter",
+                                                                       action="")) is True:
+                                Common.logger(log_type, crawler).info('已中过滤词\n')
+                            elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
+                                Common.logger(log_type, crawler).info('视频已下载\n')
+                            else:
+                                cls.download_publish(log_type=log_type,
+                                                     crawler=crawler,
+                                                     our_uid=our_uid,
+                                                     video_dict=video_dict,
+                                                     rule_dict=rule_dict,
+                                                     env=env)
+                        except Exception as e:
+                            Common.logger(log_type, crawler).error(f"抓取单条视频时异常:{e}\n")
+            except Exception as e:
+                Common.logger(log_type, crawler).error(f"抓取第{queryCount}页时异常:{e}\n")
 
     @classmethod
     def download_publish(cls, log_type, crawler, our_uid, video_dict, rule_dict, env):