Przeglądaj źródła

Merge remote-tracking branch 'origin/master'

piaoquan 1 rok temu
rodzic
commit
bfada20074

+ 3 - 2
common/feishu.py

@@ -147,8 +147,9 @@ class Feishu:
         :return:
         """
         url = "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal/"
-        post_data = {"app_id": "cli_a13ad2afa438d00b",  # 这里账号密码是发布应用的后台账号及密码
-                     "app_secret": "4tK9LY9VbiQlY5umhE42dclBFo6t4p5O"}
+        post_data = {
+            "app_id": "cli_a13ad2afa438d00b",  # 这里账号密码是发布应用的后台账号及密码
+            "app_secret": "4tK9LY9VbiQlY5umhE42dclBFo6t4p5O"}
 
         try:
             urllib3.disable_warnings()

+ 29 - 22
douyin/douyin_author/douyin_author_scheduling_new.py

@@ -55,11 +55,33 @@ class DouyinauthorScheduling:
         mq = MQ(topic_name="topic_crawler_etl_" + env)
         next_cursor = 0
         while True:
+            flag = user_dict["link"].split("_")[0]
+            if flag == "V1":
+                rule_dict = {
+                    "like_cnt": {"min": 10000, "max": 0},
+                    'period': {"min": 90, "max": 90},
+                    'special': 0.01
+                }
+            elif flag == "V2":
+                rule_dict = {
+                    "like_cnt": {"min": 2000, "max": 0},
+                    'period': {"min": 90, "max": 90},
+                    'special': 0.01
+                }
+            elif flag == "V3":
+                rule_dict = {
+                    "like_cnt": {"min": 100, "max": 0},
+                    'period': {"min": 90, "max": 90},
+                    'special': 0.01
+                }
             cookie = cls.get_cookie(log_type, crawler, env)["cookie"]
-
+            if user_dict['link'][0] == "V":
+                link = user_dict["link"][3:]
+            else:
+                link = user_dict["link"]
             time.sleep(random.randint(5, 10))
             url = 'https://www.douyin.com/aweme/v1/web/aweme/post/'
-            account_id = user_dict["link"]
+            account_id = link
             headers = {
                 'Accept': 'application/json, text/plain, */*',
                 'Accept-Language': 'zh-CN,zh;q=0.9',
@@ -144,33 +166,18 @@ class DouyinauthorScheduling:
                         comment_count = int(data[i].get('statistics').get('comment_count'))  # 评论
                         # collect_count = data[i].get('statistics').get('collect_count')  # 收藏
                         share_count = int(data[i].get('statistics').get('share_count'))  # 转发
-                        date_three_days_ago_string = (date.today() + timedelta(days=-5)).strftime("%Y-%m-%d %H:%M:%S")
-                        rule = publish_time_str > date_three_days_ago_string
-                        if i > 2:
-                            if rule == False:
-                                break
-                        if rule == False:
-                            Common.logger(log_type, crawler).info(f"发布时间小于5天,发布时间:{publish_time_str}\n")
+                        video_percent = '%.2f' % (share_count / digg_count)
+                        special = float(rule_dict.get("special"))
+                        if float(video_percent) < special:
+                            Common.logger(log_type, crawler).info(f"不符合条件:分享/点赞-{video_percent}\n")
                             AliyunLogger.logging(
                                 code="2004",
                                 platform=crawler,
                                 mode=log_type,
                                 env=env,
-                                message=f"发布时间小于5天,发布时间:{publish_time_str}\n"
+                                message=f"不符合条件:分享/点赞-{video_percent},点赞量-{digg_count}\n"
                             )
                             continue
-                        video_percent = '%.2f' % (share_count / digg_count)
-                        if digg_count < 50000 and digg_count < 50:
-                            if float(video_percent) < 0.01:
-                                Common.logger(log_type, crawler).info(f"不符合条件:分享/点赞-{video_percent},点赞量-{digg_count}\n")
-                                AliyunLogger.logging(
-                                    code="2004",
-                                    platform=crawler,
-                                    mode=log_type,
-                                    env=env,
-                                    message=f"不符合条件:分享/点赞-{video_percent},点赞量-{digg_count}\n"
-                                )
-                                continue
                         video_dict = {'video_title': video_title,
                                       'video_id': video_id,
                                       'play_cnt': 0,

+ 32 - 24
kuaishou/kuaishou_author/kuaishou_author_scheduling_new.py

@@ -81,12 +81,35 @@ class KuaishouauthorScheduling:
         pcursor = ""
         mq = MQ(topic_name="topic_crawler_etl_" + env)
         while True:
+            flag = user_dict["link"].split("_")[0]
+            if flag == "V1":
+                rule_dict = {
+                    "play_cnt": {"min": 10000, "max": 0},
+                    'period': {"min": 90, "max": 90},
+                    'special': 0.01
+                }
+            elif flag == "V2":
+                rule_dict = {
+                    "play_cnt": {"min": 2000, "max": 0},
+                    'period': {"min": 90, "max": 90},
+                    'special': 0.01
+                }
+            elif flag == "V3":
+                rule_dict = {
+                    "play_cnt": {"min": 100, "max": 0},
+                    'period': {"min": 90, "max": 90},
+                    'special': 0.01
+                }
             time.sleep(random.randint(10, 50))
             url = "https://www.kuaishou.com/graphql"
+            if user_dict['link'][0] == "V":
+                link = user_dict["link"][3:]
+            else:
+                link = user_dict["link"]
             payload = json.dumps({
                 "operationName": "visionProfilePhotoList",
                 "variables": {
-                    "userId": user_dict["link"].replace("https://www.kuaishou.com/profile/", ""),
+                    "userId": str(link.replace("https://www.kuaishou.com/profile/", "")),
                     "pcursor": pcursor,
                     "page": "profile"
                 },
@@ -102,7 +125,7 @@ class KuaishouauthorScheduling:
                 'Accept-Language': 'zh-CN,zh-Hans;q=0.9',
                 'Host': 'www.kuaishou.com',
                 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15',
-                'Referer': f'https://www.kuaishou.com/profile/{user_dict["link"].replace("https://www.kuaishou.com/profile/", "")}',
+                'Referer': f'https://www.kuaishou.com/profile/{link.replace("https://www.kuaishou.com/profile/", "")}',
                 'Accept-Encoding': 'gzip, deflate, br',
                 'Connection': 'keep-alive'
             }
@@ -198,35 +221,20 @@ class KuaishouauthorScheduling:
                         video_height = feeds[i].get("photo", {}).get("videoResource").get("hevc", {}).get("adaptationSet", {})[0].get("representation", {})[0].get("height", 0)
                     publish_time_stamp = int(int(feeds[i].get('photo', {}).get('timestamp', 0)) / 1000)
                     publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
-                    date_three_days_ago_string = (date.today() + timedelta(days=-5)).strftime("%Y-%m-%d %H:%M:%S")
-                    rule = publish_time_str > date_three_days_ago_string
-                    if i > 2:
-                        if rule == False:
-                            break
-                    if rule == False:
-                        Common.logger(log_type, crawler).info(f"发布时间小于5天,发布时间:{publish_time_str}\n")
+                    viewCount = int(feeds[i].get('photo', {}).get('viewCount', 0))
+                    realLikeCount = int(feeds[i].get('photo', {}).get('realLikeCount', 0))
+                    video_percent = '%.2f' % (realLikeCount / viewCount)
+                    special = float(rule_dict.get("special"))
+                    if float(video_percent) < special:
+                        Common.logger(log_type, crawler).info(f"不符合条件:点赞/播放-{video_percent}\n")
                         AliyunLogger.logging(
                             code="2004",
                             platform=crawler,
                             mode=log_type,
                             env=env,
-                            message=f"发布时间小于5天,发布时间:{publish_time_str}\n"
+                            message=f"点赞量:{realLikeCount}\n"
                         )
                         continue
-                    viewCount = int(feeds[i].get('photo', {}).get('viewCount', 0))
-                    realLikeCount = int(feeds[i].get('photo', {}).get('realLikeCount', 0))
-                    video_percent = '%.2f' % (realLikeCount / viewCount)
-                    if viewCount < 100000:
-                        if float(video_percent) < 0.01:
-                            Common.logger(log_type, crawler).info(f"不符合条件:点赞/播放-{video_percent},播放量-{viewCount}\n")
-                            AliyunLogger.logging(
-                                code="2004",
-                                platform=crawler,
-                                mode=log_type,
-                                env=env,
-                                message=f"点赞量:{realLikeCount}\n"
-                            )
-                            continue
                     video_dict = {'video_title': video_title,
                                   'video_id': video_id,
                                   'play_cnt': int(feeds[i].get('photo', {}).get('viewCount', 0)),

+ 17 - 2
shipinhao/shipinhao_author/shipinhao_scheduling.py

@@ -17,6 +17,9 @@ from common.public import clean_title
 
 
 def find_target_user(name, user_list):
+    """
+    在搜索到的账号列表中找目标账号
+    """
     for obj in user_list:
         if obj["nickname"] == name:
             return obj
@@ -25,8 +28,13 @@ def find_target_user(name, user_list):
     return False
 
 
-class ShiPinHaoAccount:
+class ShiPinHaoAccount(object):
+    """
+    视频号账号爬虫
+    """
     def __init__(self, platform, mode, rule_dict, user_dict, env):
+        self.cookie = None
+        self.token = None
         self.account_name = user_dict["link"]
         self.platform = platform
         self.mode = mode
@@ -37,6 +45,9 @@ class ShiPinHaoAccount:
         self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
 
     def get_token_from_mysql(self):
+        """
+        从mysql中读取token和cookie
+        """
         select_sql = (
             f"""SELECT config from crawler_config where source = '{self.platform}'; """
         )
@@ -71,7 +82,9 @@ class ShiPinHaoAccount:
             return False
 
     def get_account_id(self):
-        # 读历史数据,如果存在 id,则直接返回 id
+        """
+        读历史数据,如果存在 id,则直接返回 id
+        """
         history_id = self.get_history_id()
         if history_id:
             return history_id
@@ -119,7 +132,9 @@ class ShiPinHaoAccount:
                 return False
 
     def get_account_videos(self):
+        """
         # 一个账号最多抓取 30 条数据
+        """
         user_id = self.get_account_id()
         if user_id:
             url = "https://mp.weixin.qq.com/cgi-bin/videosnap"

Plik diff jest za duży
+ 30 - 14
test.py


+ 0 - 0
xiaoniangao/xiaoniangao_account_scan.py


+ 38 - 31
xiaoniangao/xiaoniangao_author/xiaoniangao_author_v2.py

@@ -9,6 +9,7 @@ import requests
 from common.mq import MQ
 
 sys.path.append(os.getcwd())
+
 from common.common import Common
 from common import AliyunLogger, PiaoQuanPipeline
 from common.public import get_config_from_mysql, clean_title
@@ -40,6 +41,8 @@ class XiaoNianGaoAuthor:
         self.user_list = user_list
         self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
         self.download_count = 0
+        self.test_account = [58528285, 58527674, 58528085, 58527582, 58527601, 58527612, 58528281, 58528095, 58527323,
+                             58528071, 58527278]
 
     def get_author_list(self):
         # 每轮只抓取定量的数据,到达数量后自己退出
@@ -242,22 +245,6 @@ class XiaoNianGaoAuthor:
             "strategy": self.mode,
             "out_video_id": video_obj.get("vid", ""),
         }
-        if (
-                int(time.time()) - publish_time_stamp
-                > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
-        ):
-            AliyunLogger.logging(
-                code="2004",
-                trace_id=trace_id,
-                platform=self.platform,
-                mode=self.mode,
-                env=self.env,
-                data=video_dict,
-                message="发布时间超过{}天".format(
-                    int(self.rule_dict.get("period", {}).get("max", 1000))
-                ),
-            )
-            return False
         pipeline = PiaoQuanPipeline(
             platform=self.platform,
             mode=self.mode,
@@ -266,11 +253,41 @@ class XiaoNianGaoAuthor:
             item=video_dict,
             trace_id=trace_id,
         )
-        account_level = user_dict['account_level']
-        # if account_level == "P0" or account_level == "P1":
-        #     flag = True
-        # else:
-        flag = pipeline.process_item()
+        # account_level = user_dict['account_level']
+        if user_dict['uid'] in self.test_account:
+            if (
+                    int(time.time()) - publish_time_stamp
+                    > 3600 * 24
+            ):
+                AliyunLogger.logging(
+                    code="2004",
+                    trace_id=trace_id,
+                    platform=self.platform,
+                    mode=self.mode,
+                    env=self.env,
+                    data=video_dict,
+                    message="发布时间超过1天"
+                )
+                return False
+            flag = pipeline.repeat_video()
+        else:
+            if (
+                    int(time.time()) - publish_time_stamp
+                    > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
+            ):
+                AliyunLogger.logging(
+                    code="2004",
+                    trace_id=trace_id,
+                    platform=self.platform,
+                    mode=self.mode,
+                    env=self.env,
+                    data=video_dict,
+                    message="发布时间超过{}天".format(
+                        int(self.rule_dict.get("period", {}).get("max", 1000))
+                    ),
+                )
+                return False
+            flag = pipeline.process_item()
         if flag:
             video_dict["width"] = video_dict["video_width"]
             video_dict["height"] = video_dict["video_height"]
@@ -296,13 +313,3 @@ class XiaoNianGaoAuthor:
                 message="成功发送 MQ 至 ETL",
             )
         return True
-
-# if __name__ == "__main__":
-#     XNGA = XiaoNianGaoAuthor(
-#         platform="xiaoniangao",
-#         mode="author",
-#         rule_dict={},
-#         env="prod",
-#         user_list=[{"link": 295640510, "uid": "12334"}],
-#     )
-#     XNGA.get_author_list()

+ 128 - 71
xigua/xigua_author/xigua_author.py

@@ -12,27 +12,14 @@ from fake_useragent import FakeUserAgent
 from common.mq import MQ
 
 sys.path.append(os.getcwd())
-from common import AliyunLogger, PiaoQuanPipeline
 
-
-def tunnel_proxies():
-    # 隧道域名:端口号
-    tunnel = "q796.kdltps.com:15818"
-
-    # 用户名密码方式
-    username = "t17772369458618"
-    password = "5zqcjkmy"
-    tunnel_proxies = {
-        "http": "http://%(user)s:%(pwd)s@%(proxy)s/"
-                % {"user": username, "pwd": password, "proxy": tunnel},
-        "https": "http://%(user)s:%(pwd)s@%(proxy)s/"
-                 % {"user": username, "pwd": password, "proxy": tunnel},
-    }
-
-    return tunnel_proxies
+from common import AliyunLogger, PiaoQuanPipeline, tunnel_proxies
 
 
 def random_signature():
+    """
+    随机生成签名
+    """
     src_digits = string.digits  # string_数字
     src_uppercase = string.ascii_uppercase  # string_大写字母
     src_lowercase = string.ascii_lowercase  # string_小写字母
@@ -62,6 +49,9 @@ def random_signature():
 
 
 def get_video_url(video_info):
+    """
+    获取视频的链接
+    """
     video_url_dict = {}
     # video_url
     if "videoResource" not in video_info:
@@ -599,6 +589,9 @@ def get_video_url(video_info):
 
 
 def get_comment_cnt(item_id):
+    """
+    获取视频的评论数量
+    """
     url = "https://www.ixigua.com/tlb/comment/article/v5/tab_comments/?"
     params = {
         "tab_index": "0",
@@ -643,6 +636,9 @@ def get_comment_cnt(item_id):
 
 
 class XiGuaAuthor:
+    """
+    西瓜账号爬虫
+    """
     def __init__(self, platform, mode, rule_dict, env, user_list):
         self.platform = platform
         self.mode = mode
@@ -652,12 +648,56 @@ class XiGuaAuthor:
         self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
         self.download_count = 0
 
+    def rule_maker(self, account):
+        """
+        通过不同的账号生成不同的规则
+        :param account: 输入的账号信息
+        {'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}}
+        """
+        flag = account['link'].split("_")[0]
+        if flag == "V1":
+            rule_dict = {
+                "play_cnt": {"min": 100000, "max": 0},
+                'period': {"min": 90, "max": 90},
+                'special': 0.02
+            }
+            return rule_dict
+        elif flag == "V2":
+            rule_dict = {
+                "play_cnt": {"min": 10000, "max": 0},
+                'period': {"min": 90, "max": 90},
+                'special': 0.01
+            }
+            return rule_dict
+        elif flag == "V3":
+            rule_dict = {
+                "play_cnt": {"min": 5000, "max": 0},
+                'period': {"min": 90, "max": 90},
+                'special': 0.01
+            }
+            return rule_dict
+        else:
+            return self.rule_dict
+
     def get_author_list(self):
-        # 每轮只抓取定量的数据,到达数量后自己退出
+        """
+        每轮只抓取定量的数据,到达数量后自己退出
+        获取账号列表以及账号信息
+        """
         # max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 300))
         for user_dict in self.user_list:
             # if self.download_count <= max_count:
-            self.get_video_list(user_dict)
+            try:
+                self.get_video_list(user_dict)
+            except Exception as e:
+                AliyunLogger.logging(
+                    code="3001",
+                    account=user_dict["uid"],
+                    platform=self.platform,
+                    mode=self.mode,
+                    env=self.env,
+                    message="扫描账号时出现bug, 报错是 {}".format(e)
+                )
             #     time.sleep(random.randint(1, 15))
             # else:
             #     AliyunLogger.logging(
@@ -670,14 +710,20 @@ class XiGuaAuthor:
             #     return
 
     def get_video_list(self, user_dict):
+        """
+        获取某个账号的视频列表
+        """
         offset = 0
         signature = random_signature()
         url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
         while True:
+            if user_dict['link'][0] == "V":
+                link = user_dict["link"][3:]
+            else:
+                link = user_dict["link"]
+            to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
             params = {
-                "to_user_id": str(
-                    user_dict["link"].replace("https://www.ixigua.com/home/", "")
-                ),
+                "to_user_id": to_user_id,
                 "offset": str(offset),
                 "limit": "30",
                 "maxBehotTime": "0",
@@ -688,7 +734,7 @@ class XiGuaAuthor:
                 "_signature": signature,
             }
             headers = {
-                "referer": f'https://www.ixigua.com/home/{user_dict["link"].replace("https://www.ixigua.com/home/", "")}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
+                "referer": f'https://www.ixigua.com/home/{link.replace("https://www.ixigua.com/home/", "")}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
                 "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
             }
             response = requests.get(
@@ -723,6 +769,7 @@ class XiGuaAuthor:
                     try:
                         AliyunLogger.logging(
                             code="1001",
+                            account=user_dict['uid'],
                             platform=self.platform,
                             mode=self.mode,
                             env=self.env,
@@ -743,11 +790,13 @@ class XiGuaAuthor:
                         )
 
     def process_video_obj(self, video_obj, user_dict):
+        new_rule = self.rule_maker(user_dict)
         trace_id = self.platform + str(uuid.uuid1())
         item_id = video_obj.get("item_id", "")
         if not item_id:
             AliyunLogger.logging(
                 code="2005",
+                account=user_dict['uid'],
                 platform=self.platform,
                 mode=self.mode,
                 env=self.env,
@@ -764,32 +813,33 @@ class XiGuaAuthor:
         video_dict["out_video_id"] = video_dict["video_id"]
         video_dict["width"] = video_dict["video_width"]
         video_dict["height"] = video_dict["video_height"]
-        video_dict["crawler_rule"] = json.dumps(self.rule_dict)
+        video_dict["crawler_rule"] = json.dumps(new_rule)
         video_dict["user_id"] = user_dict["uid"]
         video_dict["publish_time"] = video_dict["publish_time_str"]
         video_dict["strategy_type"] = self.mode
         video_dict["update_time_stamp"] = int(time.time())
-        if int(time.time()) - video_dict['publish_time_stamp'] > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000)):
+        if int(time.time()) - video_dict['publish_time_stamp'] > 3600 * 24 * int(
+                new_rule.get("period", {}).get("max", 1000)):
             if not video_obj['is_top']:
                 """
                 非置顶数据发布时间超过才退出
                 """
                 AliyunLogger.logging(
                     code="2004",
+                    account=user_dict['uid'],
                     platform=self.platform,
                     mode=self.mode,
                     env=self.env,
                     data=video_dict,
                     message="发布时间超过{}天".format(
-                        int(self.rule_dict.get("period", {}).get("max", 1000))
+                        int(new_rule.get("period", {}).get("max", 1000))
                     ),
                 )
                 return False
-
         pipeline = PiaoQuanPipeline(
             platform=self.platform,
             mode=self.mode,
-            rule_dict=self.rule_dict,
+            rule_dict=new_rule,
             env=self.env,
             item=video_dict,
             trace_id=trace_id,
@@ -797,50 +847,57 @@ class XiGuaAuthor:
         title_flag = pipeline.title_flag()
         repeat_flag = pipeline.repeat_video()
         if title_flag and repeat_flag:
-            if int(video_dict['play_cnt']) >= int(self.rule_dict.get("play_cnt", {}).get("min", 100000)):
-                self.mq.send_msg(video_dict)
-                self.download_count += 1
-                AliyunLogger.logging(
-                    code="1002",
-                    platform=self.platform,
-                    mode=self.mode,
-                    env=self.env,
-                    data=video_dict,
-                    trace_id=trace_id,
-                    message="成功发送 MQ 至 ETL",
-                )
-                return True
-            else:
-                AliyunLogger.logging(
-                    code="2008",
-                    platform=self.platform,
-                    mode=self.mode,
-                    env=self.env,
-                    message="不满足特殊规则, 播放量",
-                    data=video_dict
-                )
-            if float(video_dict['like_cnt']) / float(video_dict['play_cnt']) >= 0.04:
-                self.mq.send_msg(video_dict)
-                self.download_count += 1
-                AliyunLogger.logging(
-                    code="1002",
-                    platform=self.platform,
-                    mode=self.mode,
-                    env=self.env,
-                    data=video_dict,
-                    trace_id=trace_id,
-                    message="成功发送 MQ 至 ETL",
-                )
-                return True
+            if new_rule.get("special"):
+                if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
+                    if float(video_dict['like_cnt']) / float(video_dict['play_cnt']) >= new_rule['special']:
+                        self.mq.send_msg(video_dict)
+                        self.download_count += 1
+                        AliyunLogger.logging(
+                            code="1002",
+                            account=user_dict['uid'],
+                            platform=self.platform,
+                            mode=self.mode,
+                            env=self.env,
+                            data=video_dict,
+                            trace_id=trace_id,
+                            message="成功发送 MQ 至 ETL",
+                        )
+                        return True
+                    else:
+                        AliyunLogger.logging(
+                            code="2008",
+                            account=user_dict['uid'],
+                            platform=self.platform,
+                            mode=self.mode,
+                            env=self.env,
+                            message="不满足特殊规则, 点赞量/播放量",
+                            data=video_dict
+                        )
             else:
-                AliyunLogger.logging(
-                    code="2008",
-                    platform=self.platform,
-                    mode=self.mode,
-                    env=self.env,
-                    message="不满足特殊规则, 点赞量/播放量",
-                    data=video_dict
-                )
+                if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
+                    self.mq.send_msg(video_dict)
+                    self.download_count += 1
+                    AliyunLogger.logging(
+                        code="1002",
+                        account=user_dict['uid'],
+                        platform=self.platform,
+                        mode=self.mode,
+                        env=self.env,
+                        data=video_dict,
+                        trace_id=trace_id,
+                        message="成功发送 MQ 至 ETL",
+                    )
+                    return True
+                else:
+                    AliyunLogger.logging(
+                        code="2008",
+                        account=user_dict['uid'],
+                        platform=self.platform,
+                        mode=self.mode,
+                        env=self.env,
+                        message="不满足特殊规则, 播放量",
+                        data=video_dict
+                    )
         return True
 
     def get_video_info(self, item_id, trace_id):

+ 0 - 968
xigua/xigua_author/xigua_author_test.py

@@ -1,968 +0,0 @@
-import json
-import re
-import os
-import random
-import sys
-import string
-import time
-import uuid
-import base64
-import requests
-from fake_useragent import FakeUserAgent
-
-sys.path.append(os.getcwd())
-
-
-class PiaoQuanPipelineTest:
-    def __init__(self, platform, mode, rule_dict, env, item, trace_id):
-        self.platform = platform
-        self.mode = mode
-        self.item = item
-        self.rule_dict = rule_dict
-        self.env = env
-        self.trace_id = trace_id
-
-    # 视频的发布时间限制, 属于是规则过滤
-    def publish_time_flag(self):
-        # 判断发布时间
-        publish_time_stamp = self.item["publish_time_stamp"]
-        update_time_stamp = self.item["update_time_stamp"]
-        if self.platform == "gongzhonghao":
-            if (
-                int(time.time()) - publish_time_stamp
-                > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
-            ) and (
-                int(time.time()) - update_time_stamp
-                > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
-            ):
-                message = "发布时间超过{}天".format(
-                    int(self.rule_dict.get("period", {}).get("max", 1000))
-                )
-                print(message)
-                return False
-        else:
-            if (
-                int(time.time()) - publish_time_stamp
-                > 3600 * 24 * int(self.rule_dict.get("period", {}).get("max", 1000))
-            ):
-                message = "发布时间超过{}天".format(
-                    int(self.rule_dict.get("period", {}).get("max", 1000))
-                )
-                print(message)
-                return False
-        return True
-
-    # 视频标题是否满足需求
-    def title_flag(self):
-        title = self.item["video_title"]
-        cleaned_title = re.sub(r"[^\w]", " ", title)
-        # 敏感词
-        # 获取敏感词列表
-        sensitive_words = []
-        if any(word in cleaned_title for word in sensitive_words):
-            message = "标题中包含敏感词"
-            print(message)
-            return False
-        return True
-
-    # 视频基础下载规则
-    def download_rule_flag(self):
-        for key in self.item:
-            if self.rule_dict.get(key):
-                max_value = (
-                    int(self.rule_dict[key]["max"])
-                    if int(self.rule_dict[key]["max"]) > 0
-                    else 999999999999999
-                )
-                if key == "peroid": # peroid是抓取周期天数
-                    continue
-                else:
-                    flag = int(self.rule_dict[key]["min"]) <= int(self.item[key]) <= max_value
-                    if not flag:
-                        message = "{}: {} <= {} <= {}, {}".format(
-                            key,
-                            self.rule_dict[key]["min"],
-                            self.item[key],
-                            max_value,
-                            flag,
-                        )
-                        print(message)
-                        return flag
-            else:
-                continue
-        return True
-
-    # 按照某个具体平台来去重
-    # def repeat_video(self):
-    #     # sql = f""" select * from crawler_video where platform="公众号" and out_video_id="{video_id}"; """
-    #     out_id = self.item["out_video_id"]
-    #     sql = f""" select * from crawler_video where platform = "{self.platform}" and out_video_id="{out_id}"; """
-    #     repeat_video = MysqlHelper.get_values(
-    #         log_type=self.mode, crawler=self.platform, env=self.env, sql=sql, action=""
-    #     )
-    #     if repeat_video:
-    #         message = "重复的视频"
-    #         return False
-    #     return True
-
-    def process_item(self):
-        if not self.publish_time_flag():
-            # 记录相关日志
-            return False
-        if not self.title_flag():
-            # 记录相关日志
-            return False
-        # if not self.repeat_video():
-        #     # 记录相关日志
-        #     return False
-        if not self.download_rule_flag():
-            # 记录相关日志
-            return False
-        return True
-
-
-def tunnel_proxies():
-    # 隧道域名:端口号
-    tunnel = "q796.kdltps.com:15818"
-
-    # 用户名密码方式
-    username = "t17772369458618"
-    password = "5zqcjkmy"
-    tunnel_proxies = {
-        "http": "http://%(user)s:%(pwd)s@%(proxy)s/"
-                % {"user": username, "pwd": password, "proxy": tunnel},
-        "https": "http://%(user)s:%(pwd)s@%(proxy)s/"
-                 % {"user": username, "pwd": password, "proxy": tunnel},
-    }
-
-    return tunnel_proxies
-
-
-def random_signature():
-    src_digits = string.digits  # string_数字
-    src_uppercase = string.ascii_uppercase  # string_大写字母
-    src_lowercase = string.ascii_lowercase  # string_小写字母
-    digits_num = random.randint(1, 6)
-    uppercase_num = random.randint(1, 26 - digits_num - 1)
-    lowercase_num = 26 - (digits_num + uppercase_num)
-    password = (
-            random.sample(src_digits, digits_num)
-            + random.sample(src_uppercase, uppercase_num)
-            + random.sample(src_lowercase, lowercase_num)
-    )
-    random.shuffle(password)
-    new_password = "AAAAAAAAAA" + "".join(password)[10:-4] + "AAAB"
-    new_password_start = new_password[0:18]
-    new_password_end = new_password[-7:]
-    if new_password[18] == "8":
-        new_password = new_password_start + "w" + new_password_end
-    elif new_password[18] == "9":
-        new_password = new_password_start + "x" + new_password_end
-    elif new_password[18] == "-":
-        new_password = new_password_start + "y" + new_password_end
-    elif new_password[18] == ".":
-        new_password = new_password_start + "z" + new_password_end
-    else:
-        new_password = new_password_start + "y" + new_password_end
-    return new_password
-
-
-def get_video_url(video_info):
-    video_url_dict = {}
-    # video_url
-    if "videoResource" not in video_info:
-        video_url_dict["video_url"] = ""
-        video_url_dict["audio_url"] = ""
-        video_url_dict["video_width"] = 0
-        video_url_dict["video_height"] = 0
-
-    elif "dash_120fps" in video_info["videoResource"]:
-        if (
-                "video_list" in video_info["videoResource"]["dash_120fps"]
-                and "video_4" in video_info["videoResource"]["dash_120fps"]["video_list"]
-        ):
-            video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
-                "video_4"
-            ]["backup_url_1"]
-            audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
-                "video_4"
-            ]["backup_url_1"]
-            if len(video_url) % 3 == 1:
-                video_url += "=="
-            elif len(video_url) % 3 == 2:
-                video_url += "="
-            elif len(audio_url) % 3 == 1:
-                audio_url += "=="
-            elif len(audio_url) % 3 == 2:
-                audio_url += "="
-            video_url = base64.b64decode(video_url).decode("utf8")
-            audio_url = base64.b64decode(audio_url).decode("utf8")
-            video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
-                "video_4"
-            ]["vwidth"]
-            video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
-                "video_4"
-            ]["vheight"]
-            video_url_dict["video_url"] = video_url
-            video_url_dict["audio_url"] = audio_url
-            video_url_dict["video_width"] = video_width
-            video_url_dict["video_height"] = video_height
-        elif (
-                "video_list" in video_info["videoResource"]["dash_120fps"]
-                and "video_3" in video_info["videoResource"]["dash_120fps"]["video_list"]
-        ):
-            video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
-                "video_3"
-            ]["backup_url_1"]
-            audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
-                "video_3"
-            ]["backup_url_1"]
-            if len(video_url) % 3 == 1:
-                video_url += "=="
-            elif len(video_url) % 3 == 2:
-                video_url += "="
-            elif len(audio_url) % 3 == 1:
-                audio_url += "=="
-            elif len(audio_url) % 3 == 2:
-                audio_url += "="
-            video_url = base64.b64decode(video_url).decode("utf8")
-            audio_url = base64.b64decode(audio_url).decode("utf8")
-            video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
-                "video_3"
-            ]["vwidth"]
-            video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
-                "video_3"
-            ]["vheight"]
-            video_url_dict["video_url"] = video_url
-            video_url_dict["audio_url"] = audio_url
-            video_url_dict["video_width"] = video_width
-            video_url_dict["video_height"] = video_height
-        elif (
-                "video_list" in video_info["videoResource"]["dash_120fps"]
-                and "video_2" in video_info["videoResource"]["dash_120fps"]["video_list"]
-        ):
-            video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
-                "video_2"
-            ]["backup_url_1"]
-            audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
-                "video_2"
-            ]["backup_url_1"]
-            if len(video_url) % 3 == 1:
-                video_url += "=="
-            elif len(video_url) % 3 == 2:
-                video_url += "="
-            elif len(audio_url) % 3 == 1:
-                audio_url += "=="
-            elif len(audio_url) % 3 == 2:
-                audio_url += "="
-            video_url = base64.b64decode(video_url).decode("utf8")
-            audio_url = base64.b64decode(audio_url).decode("utf8")
-            video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
-                "video_2"
-            ]["vwidth"]
-            video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
-                "video_2"
-            ]["vheight"]
-            video_url_dict["video_url"] = video_url
-            video_url_dict["audio_url"] = audio_url
-            video_url_dict["video_width"] = video_width
-            video_url_dict["video_height"] = video_height
-        elif (
-                "video_list" in video_info["videoResource"]["dash_120fps"]
-                and "video_1" in video_info["videoResource"]["dash_120fps"]["video_list"]
-        ):
-            video_url = video_info["videoResource"]["dash_120fps"]["video_list"][
-                "video_1"
-            ]["backup_url_1"]
-            audio_url = video_info["videoResource"]["dash_120fps"]["video_list"][
-                "video_1"
-            ]["backup_url_1"]
-            if len(video_url) % 3 == 1:
-                video_url += "=="
-            elif len(video_url) % 3 == 2:
-                video_url += "="
-            elif len(audio_url) % 3 == 1:
-                audio_url += "=="
-            elif len(audio_url) % 3 == 2:
-                audio_url += "="
-            video_url = base64.b64decode(video_url).decode("utf8")
-            audio_url = base64.b64decode(audio_url).decode("utf8")
-            video_width = video_info["videoResource"]["dash_120fps"]["video_list"][
-                "video_1"
-            ]["vwidth"]
-            video_height = video_info["videoResource"]["dash_120fps"]["video_list"][
-                "video_1"
-            ]["vheight"]
-            video_url_dict["video_url"] = video_url
-            video_url_dict["audio_url"] = audio_url
-            video_url_dict["video_width"] = video_width
-            video_url_dict["video_height"] = video_height
-
-        elif (
-                "dynamic_video" in video_info["videoResource"]["dash_120fps"]
-                and "dynamic_video_list"
-                in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
-                and "dynamic_audio_list"
-                in video_info["videoResource"]["dash_120fps"]["dynamic_video"]
-                and len(
-            video_info["videoResource"]["dash_120fps"]["dynamic_video"][
-                "dynamic_video_list"
-            ]
-        )
-                != 0
-                and len(
-            video_info["videoResource"]["dash_120fps"]["dynamic_video"][
-                "dynamic_audio_list"
-            ]
-        )
-                != 0
-        ):
-            video_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
-                "dynamic_video_list"
-            ][-1]["backup_url_1"]
-            audio_url = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
-                "dynamic_audio_list"
-            ][-1]["backup_url_1"]
-            if len(video_url) % 3 == 1:
-                video_url += "=="
-            elif len(video_url) % 3 == 2:
-                video_url += "="
-            elif len(audio_url) % 3 == 1:
-                audio_url += "=="
-            elif len(audio_url) % 3 == 2:
-                audio_url += "="
-            video_url = base64.b64decode(video_url).decode("utf8")
-            audio_url = base64.b64decode(audio_url).decode("utf8")
-            video_width = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
-                "dynamic_video_list"
-            ][-1]["vwidth"]
-            video_height = video_info["videoResource"]["dash_120fps"]["dynamic_video"][
-                "dynamic_video_list"
-            ][-1]["vheight"]
-            video_url_dict["video_url"] = video_url
-            video_url_dict["audio_url"] = audio_url
-            video_url_dict["video_width"] = video_width
-            video_url_dict["video_height"] = video_height
-        else:
-            video_url_dict["video_url"] = ""
-            video_url_dict["audio_url"] = ""
-            video_url_dict["video_width"] = 0
-            video_url_dict["video_height"] = 0
-
-    elif "dash" in video_info["videoResource"]:
-        if (
-                "video_list" in video_info["videoResource"]["dash"]
-                and "video_4" in video_info["videoResource"]["dash"]["video_list"]
-        ):
-            video_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
-                "backup_url_1"
-            ]
-            audio_url = video_info["videoResource"]["dash"]["video_list"]["video_4"][
-                "backup_url_1"
-            ]
-            if len(video_url) % 3 == 1:
-                video_url += "=="
-            elif len(video_url) % 3 == 2:
-                video_url += "="
-            elif len(audio_url) % 3 == 1:
-                audio_url += "=="
-            elif len(audio_url) % 3 == 2:
-                audio_url += "="
-            video_url = base64.b64decode(video_url).decode("utf8")
-            audio_url = base64.b64decode(audio_url).decode("utf8")
-            video_width = video_info["videoResource"]["dash"]["video_list"]["video_4"][
-                "vwidth"
-            ]
-            video_height = video_info["videoResource"]["dash"]["video_list"]["video_4"][
-                "vheight"
-            ]
-            video_url_dict["video_url"] = video_url
-            video_url_dict["audio_url"] = audio_url
-            video_url_dict["video_width"] = video_width
-            video_url_dict["video_height"] = video_height
-        elif (
-                "video_list" in video_info["videoResource"]["dash"]
-                and "video_3" in video_info["videoResource"]["dash"]["video_list"]
-        ):
-            video_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
-                "backup_url_1"
-            ]
-            audio_url = video_info["videoResource"]["dash"]["video_list"]["video_3"][
-                "backup_url_1"
-            ]
-            if len(video_url) % 3 == 1:
-                video_url += "=="
-            elif len(video_url) % 3 == 2:
-                video_url += "="
-            elif len(audio_url) % 3 == 1:
-                audio_url += "=="
-            elif len(audio_url) % 3 == 2:
-                audio_url += "="
-            video_url = base64.b64decode(video_url).decode("utf8")
-            audio_url = base64.b64decode(audio_url).decode("utf8")
-            video_width = video_info["videoResource"]["dash"]["video_list"]["video_3"][
-                "vwidth"
-            ]
-            video_height = video_info["videoResource"]["dash"]["video_list"]["video_3"][
-                "vheight"
-            ]
-            video_url_dict["video_url"] = video_url
-            video_url_dict["audio_url"] = audio_url
-            video_url_dict["video_width"] = video_width
-            video_url_dict["video_height"] = video_height
-        elif (
-                "video_list" in video_info["videoResource"]["dash"]
-                and "video_2" in video_info["videoResource"]["dash"]["video_list"]
-        ):
-            video_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
-                "backup_url_1"
-            ]
-            audio_url = video_info["videoResource"]["dash"]["video_list"]["video_2"][
-                "backup_url_1"
-            ]
-            if len(video_url) % 3 == 1:
-                video_url += "=="
-            elif len(video_url) % 3 == 2:
-                video_url += "="
-            elif len(audio_url) % 3 == 1:
-                audio_url += "=="
-            elif len(audio_url) % 3 == 2:
-                audio_url += "="
-            video_url = base64.b64decode(video_url).decode("utf8")
-            audio_url = base64.b64decode(audio_url).decode("utf8")
-            video_width = video_info["videoResource"]["dash"]["video_list"]["video_2"][
-                "vwidth"
-            ]
-            video_height = video_info["videoResource"]["dash"]["video_list"]["video_2"][
-                "vheight"
-            ]
-            video_url_dict["video_url"] = video_url
-            video_url_dict["audio_url"] = audio_url
-            video_url_dict["video_width"] = video_width
-            video_url_dict["video_height"] = video_height
-        elif (
-                "video_list" in video_info["videoResource"]["dash"]
-                and "video_1" in video_info["videoResource"]["dash"]["video_list"]
-        ):
-            video_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
-                "backup_url_1"
-            ]
-            audio_url = video_info["videoResource"]["dash"]["video_list"]["video_1"][
-                "backup_url_1"
-            ]
-            if len(video_url) % 3 == 1:
-                video_url += "=="
-            elif len(video_url) % 3 == 2:
-                video_url += "="
-            elif len(audio_url) % 3 == 1:
-                audio_url += "=="
-            elif len(audio_url) % 3 == 2:
-                audio_url += "="
-            video_url = base64.b64decode(video_url).decode("utf8")
-            audio_url = base64.b64decode(audio_url).decode("utf8")
-            video_width = video_info["videoResource"]["dash"]["video_list"]["video_1"][
-                "vwidth"
-            ]
-            video_height = video_info["videoResource"]["dash"]["video_list"]["video_1"][
-                "vheight"
-            ]
-            video_url_dict["video_url"] = video_url
-            video_url_dict["audio_url"] = audio_url
-            video_url_dict["video_width"] = video_width
-            video_url_dict["video_height"] = video_height
-
-        elif (
-                "dynamic_video" in video_info["videoResource"]["dash"]
-                and "dynamic_video_list"
-                in video_info["videoResource"]["dash"]["dynamic_video"]
-                and "dynamic_audio_list"
-                in video_info["videoResource"]["dash"]["dynamic_video"]
-                and len(
-            video_info["videoResource"]["dash"]["dynamic_video"][
-                "dynamic_video_list"
-            ]
-        )
-                != 0
-                and len(
-            video_info["videoResource"]["dash"]["dynamic_video"][
-                "dynamic_audio_list"
-            ]
-        )
-                != 0
-        ):
-            video_url = video_info["videoResource"]["dash"]["dynamic_video"][
-                "dynamic_video_list"
-            ][-1]["backup_url_1"]
-            audio_url = video_info["videoResource"]["dash"]["dynamic_video"][
-                "dynamic_audio_list"
-            ][-1]["backup_url_1"]
-            if len(video_url) % 3 == 1:
-                video_url += "=="
-            elif len(video_url) % 3 == 2:
-                video_url += "="
-            elif len(audio_url) % 3 == 1:
-                audio_url += "=="
-            elif len(audio_url) % 3 == 2:
-                audio_url += "="
-            video_url = base64.b64decode(video_url).decode("utf8")
-            audio_url = base64.b64decode(audio_url).decode("utf8")
-            video_width = video_info["videoResource"]["dash"]["dynamic_video"][
-                "dynamic_video_list"
-            ][-1]["vwidth"]
-            video_height = video_info["videoResource"]["dash"]["dynamic_video"][
-                "dynamic_video_list"
-            ][-1]["vheight"]
-            video_url_dict["video_url"] = video_url
-            video_url_dict["audio_url"] = audio_url
-            video_url_dict["video_width"] = video_width
-            video_url_dict["video_height"] = video_height
-        else:
-            video_url_dict["video_url"] = ""
-            video_url_dict["audio_url"] = ""
-            video_url_dict["video_width"] = 0
-            video_url_dict["video_height"] = 0
-
-    elif "normal" in video_info["videoResource"]:
-        if (
-                "video_list" in video_info["videoResource"]["normal"]
-                and "video_4" in video_info["videoResource"]["normal"]["video_list"]
-        ):
-            video_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
-                "backup_url_1"
-            ]
-            audio_url = video_info["videoResource"]["normal"]["video_list"]["video_4"][
-                "backup_url_1"
-            ]
-            if len(video_url) % 3 == 1:
-                video_url += "=="
-            elif len(video_url) % 3 == 2:
-                video_url += "="
-            elif len(audio_url) % 3 == 1:
-                audio_url += "=="
-            elif len(audio_url) % 3 == 2:
-                audio_url += "="
-            video_url = base64.b64decode(video_url).decode("utf8")
-            audio_url = base64.b64decode(audio_url).decode("utf8")
-            video_width = video_info["videoResource"]["normal"]["video_list"][
-                "video_4"
-            ]["vwidth"]
-            video_height = video_info["videoResource"]["normal"]["video_list"][
-                "video_4"
-            ]["vheight"]
-            video_url_dict["video_url"] = video_url
-            video_url_dict["audio_url"] = audio_url
-            video_url_dict["video_width"] = video_width
-            video_url_dict["video_height"] = video_height
-        elif (
-                "video_list" in video_info["videoResource"]["normal"]
-                and "video_3" in video_info["videoResource"]["normal"]["video_list"]
-        ):
-            video_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
-                "backup_url_1"
-            ]
-            audio_url = video_info["videoResource"]["normal"]["video_list"]["video_3"][
-                "backup_url_1"
-            ]
-            if len(video_url) % 3 == 1:
-                video_url += "=="
-            elif len(video_url) % 3 == 2:
-                video_url += "="
-            elif len(audio_url) % 3 == 1:
-                audio_url += "=="
-            elif len(audio_url) % 3 == 2:
-                audio_url += "="
-            video_url = base64.b64decode(video_url).decode("utf8")
-            audio_url = base64.b64decode(audio_url).decode("utf8")
-            video_width = video_info["videoResource"]["normal"]["video_list"][
-                "video_3"
-            ]["vwidth"]
-            video_height = video_info["videoResource"]["normal"]["video_list"][
-                "video_3"
-            ]["vheight"]
-            video_url_dict["video_url"] = video_url
-            video_url_dict["audio_url"] = audio_url
-            video_url_dict["video_width"] = video_width
-            video_url_dict["video_height"] = video_height
-        elif (
-                "video_list" in video_info["videoResource"]["normal"]
-                and "video_2" in video_info["videoResource"]["normal"]["video_list"]
-        ):
-            video_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
-                "backup_url_1"
-            ]
-            audio_url = video_info["videoResource"]["normal"]["video_list"]["video_2"][
-                "backup_url_1"
-            ]
-            if len(video_url) % 3 == 1:
-                video_url += "=="
-            elif len(video_url) % 3 == 2:
-                video_url += "="
-            elif len(audio_url) % 3 == 1:
-                audio_url += "=="
-            elif len(audio_url) % 3 == 2:
-                audio_url += "="
-            video_url = base64.b64decode(video_url).decode("utf8")
-            audio_url = base64.b64decode(audio_url).decode("utf8")
-            video_width = video_info["videoResource"]["normal"]["video_list"][
-                "video_2"
-            ]["vwidth"]
-            video_height = video_info["videoResource"]["normal"]["video_list"][
-                "video_2"
-            ]["vheight"]
-            video_url_dict["video_url"] = video_url
-            video_url_dict["audio_url"] = audio_url
-            video_url_dict["video_width"] = video_width
-            video_url_dict["video_height"] = video_height
-        elif (
-                "video_list" in video_info["videoResource"]["normal"]
-                and "video_1" in video_info["videoResource"]["normal"]["video_list"]
-        ):
-            video_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
-                "backup_url_1"
-            ]
-            audio_url = video_info["videoResource"]["normal"]["video_list"]["video_1"][
-                "backup_url_1"
-            ]
-            if len(video_url) % 3 == 1:
-                video_url += "=="
-            elif len(video_url) % 3 == 2:
-                video_url += "="
-            elif len(audio_url) % 3 == 1:
-                audio_url += "=="
-            elif len(audio_url) % 3 == 2:
-                audio_url += "="
-            video_url = base64.b64decode(video_url).decode("utf8")
-            audio_url = base64.b64decode(audio_url).decode("utf8")
-            video_width = video_info["videoResource"]["normal"]["video_list"][
-                "video_1"
-            ]["vwidth"]
-            video_height = video_info["videoResource"]["normal"]["video_list"][
-                "video_1"
-            ]["vheight"]
-            video_url_dict["video_url"] = video_url
-            video_url_dict["audio_url"] = audio_url
-            video_url_dict["video_width"] = video_width
-            video_url_dict["video_height"] = video_height
-
-        elif (
-                "dynamic_video" in video_info["videoResource"]["normal"]
-                and "dynamic_video_list"
-                in video_info["videoResource"]["normal"]["dynamic_video"]
-                and "dynamic_audio_list"
-                in video_info["videoResource"]["normal"]["dynamic_video"]
-                and len(
-            video_info["videoResource"]["normal"]["dynamic_video"][
-                "dynamic_video_list"
-            ]
-        )
-                != 0
-                and len(
-            video_info["videoResource"]["normal"]["dynamic_video"][
-                "dynamic_audio_list"
-            ]
-        )
-                != 0
-        ):
-            video_url = video_info["videoResource"]["normal"]["dynamic_video"][
-                "dynamic_video_list"
-            ][-1]["backup_url_1"]
-            audio_url = video_info["videoResource"]["normal"]["dynamic_video"][
-                "dynamic_audio_list"
-            ][-1]["backup_url_1"]
-            if len(video_url) % 3 == 1:
-                video_url += "=="
-            elif len(video_url) % 3 == 2:
-                video_url += "="
-            elif len(audio_url) % 3 == 1:
-                audio_url += "=="
-            elif len(audio_url) % 3 == 2:
-                audio_url += "="
-            video_url = base64.b64decode(video_url).decode("utf8")
-            audio_url = base64.b64decode(audio_url).decode("utf8")
-            video_width = video_info["videoResource"]["normal"]["dynamic_video"][
-                "dynamic_video_list"
-            ][-1]["vwidth"]
-            video_height = video_info["videoResource"]["normal"]["dynamic_video"][
-                "dynamic_video_list"
-            ][-1]["vheight"]
-            video_url_dict["video_url"] = video_url
-            video_url_dict["audio_url"] = audio_url
-            video_url_dict["video_width"] = video_width
-            video_url_dict["video_height"] = video_height
-        else:
-            video_url_dict["video_url"] = ""
-            video_url_dict["audio_url"] = ""
-            video_url_dict["video_width"] = 0
-            video_url_dict["video_height"] = 0
-
-    else:
-        video_url_dict["video_url"] = ""
-        video_url_dict["audio_url"] = ""
-        video_url_dict["video_width"] = 0
-        video_url_dict["video_height"] = 0
-
-    return video_url_dict
-
-
-def get_comment_cnt(item_id):
-    url = "https://www.ixigua.com/tlb/comment/article/v5/tab_comments/?"
-    params = {
-        "tab_index": "0",
-        "count": "10",
-        "offset": "10",
-        "group_id": str(item_id),
-        "item_id": str(item_id),
-        "aid": "1768",
-        "msToken": "50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==",
-        "X-Bogus": "DFSzswVOyGtANVeWtCLMqR/F6q9U",
-        "_signature": FakeUserAgent().random,
-    }
-    headers = {
-        "authority": "www.ixigua.com",
-        "accept": "application/json, text/plain, */*",
-        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
-        "cache-control": "no-cache",
-        "cookie": "MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; passport_csrf_token=72b2574f3c99f8ba670e42df430218fd; passport_csrf_token_default=72b2574f3c99f8ba670e42df430218fd; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; uid_tt=c13f47d51767f616befe32fb3e9f485a; uid_tt_ss=c13f47d51767f616befe32fb3e9f485a; sid_tt=c7472b508ea631823ba765a60cf8757f; sessionid=c7472b508ea631823ba765a60cf8757f; sessionid_ss=c7472b508ea631823ba765a60cf8757f; sid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; ssid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; SEARCH_CARD_MODE=7168304743566296612_0; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=7Pux7s634-z8DYvCM20y7KigwH5u7Rh6D9C-RROpnT.aGMEcz6Vsxp.oai47wJqa4f86; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1683858689%7Ca5223fe1500578e01e138a0d71d6444692018296c4c24f5885af174a65873c95; ixigua-a-s=3; msToken=50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==; __ac_nonce=0645dcbf0005064517440; __ac_signature=_02B4Z6wo00f01FEGmAwAAIDBKchzCGqn-MBRJpyAAHAjieFC5GEg6gGiwz.I4PRrJl7f0GcixFrExKmgt6QI1i1S-dQyofPEj2ugWTCnmKUdJQv-wYuDofeKNe8VtMtZq2aKewyUGeKU-5Ud21; ixigua-a-s=3",
-        "pragma": "no-cache",
-        "referer": f"https://www.ixigua.com/{item_id}?logTag=3c5aa86a8600b9ab8540",
-        "sec-ch-ua": '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
-        "sec-ch-ua-mobile": "?0",
-        "sec-ch-ua-platform": '"macOS"',
-        "sec-fetch-dest": "empty",
-        "sec-fetch-mode": "cors",
-        "sec-fetch-site": "same-origin",
-        "tt-anti-token": "cBITBHvmYjEygzv-f9c78c1297722cf1f559c74b084e4525ce4900bdcf9e8588f20cc7c2e3234422",
-        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
-        "x-secsdk-csrf-token": "000100000001f8e733cf37f0cd255a51aea9a81ff7bc0c09490cfe41ad827c3c5c18ec809279175e4d9f5553d8a5",
-    }
-    response = requests.get(
-        url=url, headers=headers, params=params, proxies=tunnel_proxies(), timeout=5
-    )
-    response.close()
-    if (
-            response.status_code != 200
-            or "total_number" not in response.json()
-            or response.json() == {}
-    ):
-        return 0
-    return response.json().get("total_number", 0)
-
-
-def get_video_info(item_id, trace_id):
-    url = "https://www.ixigua.com/api/mixVideo/information?"
-    headers = {
-        "accept-encoding": "gzip, deflate",
-        "accept-language": "zh-CN,zh-Hans;q=0.9",
-        "user-agent": FakeUserAgent().random,
-        "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
-    }
-    params = {
-        "mixId": str(item_id),
-        "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC"
-                   "NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
-        "X-Bogus": "DFSzswVupYTANCJOSBk0P53WxM-r",
-        "_signature": "_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px"
-                      "fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94",
-    }
-    cookies = {
-        "ixigua-a-s": "1",
-        "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB"
-                   "NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
-        "ttwid": "1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7"
-                 "6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8",
-        "tt_scid": "QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3",
-        "MONITOR_WEB_ID": "0a49204a-7af5-4e96-95f0-f4bafb7450ad",
-        "__ac_nonce": "06304878000964fdad287",
-        "__ac_signature": "_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb"
-                          "FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8",
-        "ttcid": "e56fabf6e85d4adf9e4d91902496a0e882",
-        "_tea_utm_cache_1300": "undefined",
-        "support_avif": "false",
-        "support_webp": "false",
-        "xiguavideopcwebid": "7134967546256016900",
-        "xiguavideopcwebid.sig": "xxRww5R1VEMJN_dQepHorEu_eAc",
-    }
-    response = requests.get(
-        url=url,
-        headers=headers,
-        params=params,
-        cookies=cookies,
-        proxies=tunnel_proxies(),
-        timeout=5,
-    )
-    if (
-            response.status_code != 200
-            or "data" not in response.json()
-            or response.json()["data"] == {}
-    ):
-        print("获取视频信息失败")
-        return None
-    else:
-        video_info = (
-            response.json()["data"]
-            .get("gidInformation", {})
-            .get("packerData", {})
-            .get("video", {})
-        )
-        if video_info == {}:
-            return None
-        video_detail = get_video_url(video_info)
-        video_dict = {
-            "video_title": video_info.get("title", ""),
-            "video_id": video_info.get("videoResource", {}).get("vid", ""),
-            "gid": str(item_id),
-            "play_cnt": int(video_info.get("video_watch_count", 0)),
-            "like_cnt": int(video_info.get("video_like_count", 0)),
-            "comment_cnt": int(get_comment_cnt(item_id)),
-            "share_cnt": 0,
-            "favorite_cnt": 0,
-            "duration": int(video_info.get("video_duration", 0)),
-            "video_width": int(video_detail["video_width"]),
-            "video_height": int(video_detail["video_height"]),
-            "publish_time_stamp": int(video_info.get("video_publish_time", 0)),
-            "publish_time_str": time.strftime(
-                "%Y-%m-%d %H:%M:%S",
-                time.localtime(int(video_info.get("video_publish_time", 0))),
-            ),
-            "user_name": video_info.get("user_info", {}).get("name", ""),
-            "user_id": str(video_info.get("user_info", {}).get("user_id", "")),
-            "avatar_url": str(
-                video_info.get("user_info", {}).get("avatar_url", "")
-            ),
-            "cover_url": video_info.get("poster_url", ""),
-            "audio_url": video_detail["audio_url"],
-            "video_url": video_detail["video_url"],
-            "session": f"xigua-search-{int(time.time())}",
-        }
-        return video_dict
-
-
-class XiGuaAuthor:
-    def __init__(self, platform, mode, rule_dict, env, user_list):
-        self.platform = platform
-        self.mode = mode
-        self.rule_dict = rule_dict
-        self.env = env
-        self.user_list = user_list
-        # self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
-        self.download_count = 0
-
-    def get_author_list(self):
-        # 每轮只抓取定量的数据,到达数量后自己退出
-        max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 300))
-        for user_dict in self.user_list:
-            self.get_video_list(user_dict)
-            if self.download_count <= max_count:
-                self.get_video_list(user_dict)
-                time.sleep(random.randint(1, 15))
-            else:
-                print("本轮已经抓取足够数量的视频,已经自动退出")
-                return
-
-    def get_video_list(self, user_dict):
-        offset = 0
-        signature = random_signature()
-        url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
-        while True:
-            params = {
-                "to_user_id": str(
-                    user_dict["link"].replace("https://www.ixigua.com/home/", "")
-                ),
-                "offset": str(offset),
-                "limit": "30",
-                "maxBehotTime": "0",
-                "order": "new",
-                "isHome": "0",
-                # 'msToken': 'G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==',
-                # 'X-Bogus': 'DFSzswVuEkUANjW9ShFTgR/F6qHt',
-                "_signature": signature,
-            }
-            headers = {
-                "referer": f'https://www.ixigua.com/home/{user_dict["link"].replace("https://www.ixigua.com/home/", "")}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
-                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
-            }
-            response = requests.get(
-                url=url,
-                headers=headers,
-                params=params,
-                proxies=tunnel_proxies(),
-                timeout=5,
-            )
-            offset += 30
-            if "data" not in response.text or response.status_code != 200:
-                print(f"get_videoList:{response.text}\n")
-                return
-            elif not response.json()["data"]["videoList"]:
-                print(f"没有更多数据啦~\n")
-                return
-            else:
-                feeds = response.json()["data"]["videoList"]
-                for video_obj in feeds:
-                    print(video_obj['is_top'])
-                    # print(json.dumps(video_obj, ensure_ascii=False, indent=4))
-                    # return
-                    self.process_video_obj(video_obj, user_dict)
-
-                    # try:
-                    #     print("扫描到一条视频")
-                    #     self.process_video_obj(video_obj, user_dict)
-                    # except Exception as e:
-                    #     print("抓取单条视频异常, 报错原因是: {}".format(e))
-
-    def process_video_obj(self, video_obj, user_dict):
-        trace_id = self.platform + str(uuid.uuid1())
-        item_id = video_obj.get("item_id", "")
-        if not item_id:
-            print("无效视频")
-            return
-        # 获取视频信息
-        video_dict = get_video_info(item_id=item_id, trace_id=trace_id)
-        video_dict["out_user_id"] = video_dict["user_id"]
-        video_dict["platform"] = self.platform
-        video_dict["strategy"] = self.mode
-        video_dict["out_video_id"] = video_dict["video_id"]
-        video_dict["width"] = video_dict["video_width"]
-        video_dict["height"] = video_dict["video_height"]
-        video_dict["crawler_rule"] = json.dumps(self.rule_dict)
-        video_dict["user_id"] = user_dict["uid"]
-        video_dict["publish_time"] = video_dict["publish_time_str"]
-        video_dict["strategy_type"] = self.mode
-        video_dict["update_time_stamp"] = int(time.time())
-        pipeline = PiaoQuanPipelineTest(
-            platform=self.platform,
-            mode=self.mode,
-            rule_dict=self.rule_dict,
-            env=self.env,
-            item=video_dict,
-            trace_id=trace_id,
-        )
-        flag = pipeline.process_item()
-        if flag:
-            print(json.dumps(video_dict, ensure_ascii=False, indent=4))
-            # self.mq.send_msg(video_dict)
-            self.download_count += 1
-            print("成功发送 MQ 至 ETL")
-
-
-if __name__ == "__main__":
-    user_list = [
-        {
-            "uid": 6267140,
-            "source": "xigua",
-            "link": "https://www.ixigua.com/home/113976532286319/?list_entrance=anyVideo",
-            "nick_name": "云姐犹记",
-            "avatar_url": "",
-            "mode": "author",
-        }
-    ]
-    # rule = {'period': {'min': 30, 'max': 30}, 'duration': {'min': 20, 'max': 0}, 'play_cnt': {'min': 100, 'max': 0}}
-    XGA = XiGuaAuthor(
-        platform="xigua",
-        mode="author",
-        rule_dict={},
-        env="prod",
-        user_list=user_list
-    )
-    XGA.get_author_list()
-    # item_id = "v0201ag10000cl4d7djc77u73eftvrcg"
-    # get_video_info(item_id=item_id, trace_id="ljh")

Niektóre pliki nie zostały wyświetlone z powodu dużej ilości zmienionych plików