lierqiang 2 年 前
コミット
bdf05705cc

ファイルの差分が大きいため隠しています
+ 4 - 2
douyin/douyin_recommend/recommend_dy.py


+ 4 - 3
kuaishou/kuaishou_follow/kuaishou_follow.py

@@ -286,7 +286,7 @@ class KuaiShouFollow:
                           .replace("#", "").replace(".", "。").replace("\\", "") \
                           .replace(":", "").replace("*", "").replace("?", "") \
                           .replace("?", "").replace('"', "").replace("<", "") \
-                          .replace(">", "").replace("|", "").replace("@", "")[:40]
+                          .replace(">", "").replace("|", "").replace("@", "").replace('"', '').replace("'", '')[:40]
         if video_title.replace(" ", "") == "" or video_title == "。。。" or video_title == "...":
             return cls.random_title(log_type, crawler)
         else:
@@ -321,7 +321,8 @@ class KuaiShouFollow:
         }
 
         try:
-            response = requests.post(url=url, headers=headers, data=payload, proxies=Common.tunnel_proxies(), verify=False, timeout=10)
+            response = requests.post(url=url, headers=headers, data=payload, proxies=Common.tunnel_proxies(),
+                                     verify=False, timeout=10)
         except Exception as e:
             Common.logger(log_type, crawler).error(f"get_videoList:{e}\n")
             return
@@ -679,7 +680,7 @@ class KuaiShouFollow:
             except Exception as e:
                 Common.logger(log_type, crawler).info(f"用户:{user_name}, 抓取异常:{e}\n")
                 continue
-            sleep_time = random.randint(1,3)
+            sleep_time = random.randint(1, 3)
             Common.logger(log_type, crawler).info(f"休眠{sleep_time}秒\n")
             time.sleep(sleep_time)
 

+ 3 - 3
kuaishou/kuaishou_recommend/recommend_kuaishou.py

@@ -180,7 +180,7 @@ class KuaiShouRecommend:
                           .replace("#", "").replace(".", "。").replace("\\", "") \
                           .replace(":", "").replace("*", "").replace("?", "") \
                           .replace("?", "").replace('"', "").replace("<", "") \
-                          .replace(">", "").replace("|", "").replace("@", "")[:40]
+                          .replace(">", "").replace("|", "").replace("@", "").replace('"', '').replace("'", '')[:40]
         if video_title.replace(" ", "") == "" or video_title == "。。。" or video_title == "...":
             return cls.random_title(log_type, crawler)
         else:
@@ -209,7 +209,8 @@ class KuaiShouRecommend:
             headers = {
                 'Accept-Language': 'zh-CN,zh;q=0.9',
                 'Connection': 'keep-alive',
-                'Cookie': 'kpf=PC_WEB; clientid=3; did=web_aba004b1780f4d7174d0a2ff42da1f{r}7; kpn=KUAISHOU_VISION;'.format(r=r),
+                'Cookie': 'kpf=PC_WEB; clientid=3; did=web_aba004b1780f4d7174d0a2ff42da1f{r}7; kpn=KUAISHOU_VISION;'.format(
+                    r=r),
                 'Origin': 'https://www.kuaishou.com',
                 'Referer': 'https://www.kuaishou.com/new-reco',
                 'Sec-Fetch-Dest': 'empty',
@@ -516,4 +517,3 @@ class KuaiShouRecommend:
 
 if __name__ == "__main__":
     KuaiShouRecommend.get_videoList('recommend', 'kuaishou', '推荐抓取策略', 55440319, 'outer', 'prod', 'aliyun')
-

ファイルの差分が大きいため隠しています
+ 1 - 15
xiaoniangao/xiaoniangao_follow/xiaoniangao_follow.py


+ 91 - 75
xiaoniangao/xiaoniangao_hour/xiaoniangao_hour.py

@@ -10,12 +10,14 @@ import sys
 import time
 import requests
 import urllib3
+
 sys.path.append(os.getcwd())
 from common.common import Common
 from common.feishu import Feishu
 from common.publish import Publish
 from common.scheduling_db import MysqlHelper
 from common.public import filter_word
+
 proxies = {"http": None, "https": None}
 
 
@@ -53,7 +55,7 @@ class XiaoniangaoHour:
                         # 分享量
                         if int(video_dict["share_cnt"]) >= 0:
                             # 发布时间 <= 10 天
-                            if int(time.time()) - int(video_dict["publish_time_stamp"]) <= 3600*24*10:
+                            if int(time.time()) - int(video_dict["publish_time_stamp"]) <= 3600 * 24 * 10:
                                 return True
                             else:
                                 return False
@@ -70,7 +72,8 @@ class XiaoniangaoHour:
     @classmethod
     def get_expression(cls):
         # 表情列表
-        expression_list = ['📍', '⭕️', '🔥', '📣', '🎈', '⚡', '🔔', '🚩', '💢', '💎', '👉', '💓', '❗️', '🔴', '🔺', '♦️', '♥️', '👉', '👈', '🏆', '❤️\u200d🔥']
+        expression_list = ['📍', '⭕️', '🔥', '📣', '🎈', '⚡', '🔔', '🚩', '💢', '💎', '👉', '💓', '❗️', '🔴', '🔺', '♦️', '♥️', '👉',
+                           '👈', '🏆', '❤️\u200d🔥']
         # 符号列表
         char_list = ['...', '~~']
         return expression_list, char_list
@@ -109,56 +112,56 @@ class XiaoniangaoHour:
             "Referer": 'https://servicewechat.com/wxd7911e4c177690e4/624/page-frame.html'
         }
         data = {
-        "log_params": {
-            "page": "discover_rec",
-            "common": {
-                "brand": "iPhone",
-                "device": "iPhone 11",
-                "os": "iOS 14.7.1",
-                "weixinver": "8.0.20",
-                "srcver": "2.24.2",
-                "net": "wifi",
-                "scene": 1089
-            }
-        },
-        "qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!750x500r/crop/750x500/interlace/1/format/jpg",
-        "h_qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!80x80r/crop/80x80/interlace/1/format/jpg",
-        "share_width": 625,
-        "share_height": 500,
-        "ext": {
-            "fmid": 0,
-            "items": {}
-        },
-        "app": "xng",
-        "rec_scene": "discover_rec",
-        "log_common_params": {
-            "e": [{
-                "data": {
-                    "page": "discoverIndexPage",
-                    "topic": "recommend"
-                },
-                "ab": {}
-            }],
+            "log_params": {
+                "page": "discover_rec",
+                "common": {
+                    "brand": "iPhone",
+                    "device": "iPhone 11",
+                    "os": "iOS 14.7.1",
+                    "weixinver": "8.0.20",
+                    "srcver": "2.24.2",
+                    "net": "wifi",
+                    "scene": 1089
+                }
+            },
+            "qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!750x500r/crop/750x500/interlace/1/format/jpg",
+            "h_qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!80x80r/crop/80x80/interlace/1/format/jpg",
+            "share_width": 625,
+            "share_height": 500,
             "ext": {
-                "brand": "iPhone",
-                "device": "iPhone 11",
-                "os": "iOS 14.7.1",
-                "weixinver": "8.0.20",
-                "srcver": "2.24.3",
-                "net": "wifi",
-                "scene": "1089"
+                "fmid": 0,
+                "items": {}
+            },
+            "app": "xng",
+            "rec_scene": "discover_rec",
+            "log_common_params": {
+                "e": [{
+                    "data": {
+                        "page": "discoverIndexPage",
+                        "topic": "recommend"
+                    },
+                    "ab": {}
+                }],
+                "ext": {
+                    "brand": "iPhone",
+                    "device": "iPhone 11",
+                    "os": "iOS 14.7.1",
+                    "weixinver": "8.0.20",
+                    "srcver": "2.24.3",
+                    "net": "wifi",
+                    "scene": "1089"
+                },
+                "pj": "1",
+                "pf": "2",
+                "session_id": "7bcce313-b57d-4305-8d14-6ebd9a1bad29"
             },
-            "pj": "1",
-            "pf": "2",
-            "session_id": "7bcce313-b57d-4305-8d14-6ebd9a1bad29"
-        },
-        "refresh": False,
-        "token": uid_token_dict["token"],
-        "uid": uid_token_dict["uid"],
-        "proj": "ma",
-        "wx_ver": "8.0.20",
-        "code_ver": "3.62.0"
-    }
+            "refresh": False,
+            "token": uid_token_dict["token"],
+            "uid": uid_token_dict["uid"],
+            "proj": "ma",
+            "wx_ver": "8.0.20",
+            "code_ver": "3.62.0"
+        }
         urllib3.disable_warnings()
         r = requests.post(url=url, headers=headers, json=data, proxies=proxies, verify=False)
         if 'data' not in r.text or r.status_code != 200:
@@ -184,7 +187,8 @@ class XiaoniangaoHour:
                         .replace(".", "。").replace("\\", "").replace("&NBSP", "") \
                         .replace(":", "").replace("*", "").replace("?", "") \
                         .replace("?", "").replace('"', "").replace("<", "") \
-                        .replace(">", "").replace("|", "").replace(" ", "").replace("#表情", "").replace("#符号", "")
+                        .replace(">", "").replace("|", "").replace(" ", "").replace("#表情", "").replace("#符号","").replace(
+                        '"', '').replace("'", '').replace('"', '').replace("'", '')
 
                     expression = cls.get_expression()
                     expression_list = expression[0]
@@ -253,7 +257,7 @@ class XiaoniangaoHour:
                     video_send_time = feeds[i]["t"]
                 else:
                     video_send_time = 0
-                publish_time_stamp = int(int(video_send_time)/1000)
+                publish_time_stamp = int(int(video_send_time) / 1000)
                 publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
 
                 # 用户名 / 头像
@@ -318,7 +322,8 @@ class XiaoniangaoHour:
                 elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
                     Common.logger(log_type, crawler).info('视频已下载\n')
                 # 过滤敏感词
-                elif any(str(word) if str(word) in video_title else False for word in filter_word(log_type, crawler, "小年糕", env)) is True:
+                elif any(str(word) if str(word) in video_title else False for word in
+                         filter_word(log_type, crawler, "小年糕", env)) is True:
                     Common.logger(log_type, crawler).info("视频已中过滤词\n")
                     time.sleep(1)
                 else:
@@ -431,7 +436,7 @@ class XiaoniangaoHour:
             hour_video_width = r.json()["data"]["w"]
             hour_video_height = r.json()["data"]["h"]
             hour_video_send_time = r.json()["data"]["t"]
-            publish_time_stamp = int(int(hour_video_send_time)/1000)
+            publish_time_stamp = int(int(hour_video_send_time) / 1000)
             publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
             hour_user_name = r.json()["data"]["user"]["nick"]
             hour_head_url = r.json()["data"]["user"]["hurl"]
@@ -479,20 +484,21 @@ class XiaoniangaoHour:
                 profile_mid = update_video_info["profile_mid"]
                 video_title = update_video_info["video_title"]
                 video_id = update_video_info["out_video_id"]
-                if datetime.datetime.now().hour == 10 and datetime.datetime.now().minute <=10:
+                if datetime.datetime.now().hour == 10 and datetime.datetime.now().minute <= 10:
                     video_info_dict = cls.get_video_info(log_type=log_type,
-                                                      crawler=crawler,
-                                                      p_id=profile_id,
-                                                      p_mid=profile_mid,
-                                                      v_title=video_title,
-                                                      v_id=video_id)
+                                                         crawler=crawler,
+                                                         p_id=profile_id,
+                                                         p_mid=profile_mid,
+                                                         v_title=video_title,
+                                                         v_id=video_id)
                     ten_play_cnt = video_info_dict['play_cnt']
                     Common.logger(log_type, crawler).info(f"ten_play_cnt:{ten_play_cnt}")
                     update_sql = f""" update crawler_xiaoniangao_hour set ten_play_cnt={ten_play_cnt} WHERE out_video_id="{video_id}"; """
                     # Common.logger(log_type, crawler).info(f"update_sql:{update_sql}")
                     MysqlHelper.update_values(log_type, crawler, update_sql, env)
-                    cls.download_publish(log_type, crawler, video_info_dict, update_video_info, strategy, oss_endpoint, env)
-                elif datetime.datetime.now().hour == 15 and datetime.datetime.now().minute <=10:
+                    cls.download_publish(log_type, crawler, video_info_dict, update_video_info, strategy, oss_endpoint,
+                                         env)
+                elif datetime.datetime.now().hour == 15 and datetime.datetime.now().minute <= 10:
                     video_info_dict = cls.get_video_info(log_type=log_type,
                                                          crawler=crawler,
                                                          p_id=profile_id,
@@ -504,8 +510,9 @@ class XiaoniangaoHour:
                     update_sql = f""" update crawler_xiaoniangao_hour set fifteen_play_cnt={fifteen_play_cnt} WHERE out_video_id="{video_id}"; """
                     # Common.logger(log_type, crawler).info(f"update_sql:{update_sql}")
                     MysqlHelper.update_values(log_type, crawler, update_sql, env)
-                    cls.download_publish(log_type, crawler, video_info_dict, update_video_info, strategy, oss_endpoint, env)
-                elif datetime.datetime.now().hour == 20 and datetime.datetime.now().minute <=10:
+                    cls.download_publish(log_type, crawler, video_info_dict, update_video_info, strategy, oss_endpoint,
+                                         env)
+                elif datetime.datetime.now().hour == 20 and datetime.datetime.now().minute <= 10:
                     video_info_dict = cls.get_video_info(log_type=log_type,
                                                          crawler=crawler,
                                                          p_id=profile_id,
@@ -517,7 +524,8 @@ class XiaoniangaoHour:
                     update_sql = f""" update crawler_xiaoniangao_hour set twenty_play_cnt={twenty_play_cnt} WHERE out_video_id="{video_id}"; """
                     # Common.logger(log_type, crawler).info(f"update_sql:{update_sql}")
                     MysqlHelper.update_values(log_type, crawler, update_sql, env)
-                    cls.download_publish(log_type, crawler, video_info_dict, update_video_info, strategy, oss_endpoint, env)
+                    cls.download_publish(log_type, crawler, video_info_dict, update_video_info, strategy, oss_endpoint,
+                                         env)
                 else:
                     pass
         except Exception as e:
@@ -526,9 +534,11 @@ class XiaoniangaoHour:
     @classmethod
     def download(cls, log_type, crawler, video_info_dict, strategy, oss_endpoint, env):
         # 下载封面
-        Common.download_method(log_type=log_type, crawler=crawler, text="cover", title=video_info_dict["video_title"], url=video_info_dict["cover_url"])
+        Common.download_method(log_type=log_type, crawler=crawler, text="cover", title=video_info_dict["video_title"],
+                               url=video_info_dict["cover_url"])
         # 下载视频
-        Common.download_method(log_type=log_type, crawler=crawler, text="video", title=video_info_dict["video_title"], url=video_info_dict["video_url"])
+        Common.download_method(log_type=log_type, crawler=crawler, text="video", title=video_info_dict["video_title"],
+                               url=video_info_dict["video_url"])
         # 保存视频信息至 "./videos/{download_video_title}/info.txt"
         Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_info_dict)
 
@@ -621,30 +631,36 @@ class XiaoniangaoHour:
     def download_publish(cls, log_type, crawler, video_info_dict, update_video_info, strategy, oss_endpoint, env):
         # try:
         if cls.repeat_video(log_type, crawler, video_info_dict["video_id"], env) != 0:
-                Common.logger(log_type, crawler).info('视频已下载\n')
+            Common.logger(log_type, crawler).info('视频已下载\n')
         # 播放量大于 50000,直接下载
         elif int(video_info_dict["play_cnt"]) >= 50000:
-            Common.logger(log_type, crawler).info(f"播放量:{video_info_dict['play_cnt']} >= 50000,满足下载规则,开始下载视频")
+            Common.logger(log_type, crawler).info(
+                f"播放量:{video_info_dict['play_cnt']} >= 50000,满足下载规则,开始下载视频")
             cls.download(log_type, crawler, video_info_dict, strategy, oss_endpoint, env)
 
         # 上升榜判断逻辑,任意时间段上升量>=5000,连续两个时间段上升量>=2000
-        elif int(update_video_info['ten_play_cnt']) >= 5000 or int(update_video_info['fifteen_play_cnt']) >= 5000 or int(update_video_info['twenty_play_cnt']) >= 5000:
-            Common.logger(log_type, crawler).info(f"10:00 or 15:00 or 20:00 数据上升量:{int(update_video_info['ten_play_cnt'])} or {int(update_video_info['fifteen_play_cnt'])} or {int(update_video_info['twenty_play_cnt'])} >= 5000")
+        elif int(update_video_info['ten_play_cnt']) >= 5000 or int(
+                update_video_info['fifteen_play_cnt']) >= 5000 or int(update_video_info['twenty_play_cnt']) >= 5000:
+            Common.logger(log_type, crawler).info(
+                f"10:00 or 15:00 or 20:00 数据上升量:{int(update_video_info['ten_play_cnt'])} or {int(update_video_info['fifteen_play_cnt'])} or {int(update_video_info['twenty_play_cnt'])} >= 5000")
             Common.logger(log_type, crawler).info("满足下载规则,开始下载视频")
             cls.download(log_type, crawler, video_info_dict, strategy, oss_endpoint, env)
 
         elif int(update_video_info['ten_play_cnt']) >= 2000 and int(update_video_info['fifteen_play_cnt']) >= 2000:
-            Common.logger(log_type, crawler).info(f"10:00 and 15:00 数据上升量:{int(update_video_info['ten_play_cnt'])} and {int(update_video_info['fifteen_play_cnt'])} >= 2000")
+            Common.logger(log_type, crawler).info(
+                f"10:00 and 15:00 数据上升量:{int(update_video_info['ten_play_cnt'])} and {int(update_video_info['fifteen_play_cnt'])} >= 2000")
             Common.logger(log_type, crawler).info("满足下载规则,开始下载视频")
             cls.download(log_type, crawler, video_info_dict, strategy, oss_endpoint, env)
 
         elif int(update_video_info['fifteen_play_cnt']) >= 2000 and int(update_video_info['twenty_play_cnt']) >= 2000:
-            Common.logger(log_type, crawler).info(f"15:00 and 20:00 数据上升量:{int(update_video_info['fifteen_play_cnt'])} and {int(update_video_info['twenty_play_cnt'])} >= 2000")
+            Common.logger(log_type, crawler).info(
+                f"15:00 and 20:00 数据上升量:{int(update_video_info['fifteen_play_cnt'])} and {int(update_video_info['twenty_play_cnt'])} >= 2000")
             Common.logger(log_type, crawler).info("满足下载规则,开始下载视频")
             cls.download(log_type, crawler, video_info_dict, strategy, oss_endpoint, env)
 
         elif int(update_video_info['ten_play_cnt']) >= 2000 and int(update_video_info['twenty_play_cnt']) >= 2000:
-            Common.logger(log_type, crawler).info(f"今日10:00 / 20:00数据上升量:{int(update_video_info['ten_play_cnt'])} and {int(update_video_info['twenty_play_cnt'])} >= 2000")
+            Common.logger(log_type, crawler).info(
+                f"今日10:00 / 20:00数据上升量:{int(update_video_info['ten_play_cnt'])} and {int(update_video_info['twenty_play_cnt'])} >= 2000")
             Common.logger(log_type, crawler).info("满足下载规则,开始下载视频")
             cls.download(log_type, crawler, video_info_dict, strategy, oss_endpoint, env)
 
@@ -660,4 +676,4 @@ if __name__ == "__main__":
     # XiaoniangaoHour.get_videoList("test", "xiaoniangao", "dev")
     # XiaoniangaoHour.update_videoList("test", "xiaoniangao", "小时榜爬虫策略", "out", "dev")
 
-    pass
+    pass

+ 1 - 1
xiaoniangao/xiaoniangao_play/xiaoniangao_play.py

@@ -175,7 +175,7 @@ class XiaoniangaoPlay:
                             .replace(".", "。").replace("\\", "").replace("&NBSP", "") \
                             .replace(":", "").replace("*", "").replace("?", "") \
                             .replace("?", "").replace('"', "").replace("<", "") \
-                            .replace(">", "").replace("|", "").replace(" ", "").replace("#表情", "").replace("#符号", "")
+                            .replace(">", "").replace("|", "").replace(" ", "").replace("#表情", "").replace("#符号", "").replace('"' ,'').replace("'", '')
 
                         expression = cls.get_expression()
                         expression_list = expression[0]

+ 1 - 1
xigua/xigua_follow/xigua_follow.py

@@ -720,7 +720,7 @@ class Follow:
                             video_title = 0
                         else:
                             video_title = videoList[i]['title'].strip().replace('手游', '') \
-                                .replace('/', '').replace('\/', '').replace('\n', '')
+                                .replace('/', '').replace('\/', '').replace('\n', '').replace('"' ,'').replace("'", '')
 
                         # video_id
                         if 'video_id' not in videoList[i]:

+ 1 - 1
xigua/xigua_recommend/xigua_recommend.py

@@ -685,7 +685,7 @@ class XiguaRecommend:
                     if 'data' not in videoList[i]:
                         continue
                     # video_title
-                    video_title = videoList[i]['data'].get('title', '')
+                    video_title = videoList[i]['data'].get('title', '').replace('"' ,'').replace("'", '')
                     if video_title == '':
                         video_title = random.choice(cls.xigua_config(log_type, crawler, "title", env))
                     # video_id

+ 1 - 1
youtube/youtube_follow/youtube_follow_api.py

@@ -922,7 +922,7 @@ class YoutubeFollow:
                 if 'title' not in videoDetails:
                     video_title = ''
                 else:
-                    video_title = videoDetails['title']
+                    video_title = videoDetails['title'].replace('"' ,'').replace("'", '')
                 video_title = cls.filter_emoji(video_title)
                 if not cls.is_contain_chinese(video_title):
                     video_title = Translate.google_translate(video_title, machine) \

この差分においてかなりの量のファイルが変更されているため、一部のファイルを表示していません