1 năm trước cách đây · cdaeb4de69
--- a/xigua/xigua_author/xigua_author.py
+++ b/xigua/xigua_author/xigua_author.py
@@ -1,5 +1,6 @@
 
				 import json
			
 
				 import os
			
 
				+import re
			
 
				 import random
			
 
				 import sys
			
 
				 import string
			
@@ -17,6 +18,59 @@ from common import AliyunLogger, PiaoQuanPipeline, tunnel_proxies
 
				 from common.limit import AuthorLimit
			
 
				 
			
 
				 
			
 
				+def extract_info_by_re(text):
			
 
				+    """
			
 
				+    通过正则表达式获取文本中的信息
			
 
				+    :param text:
			
 
				+    :return:
			
 
				+    """
			
 
				+    # 标题
			
 
				+    title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
			
 
				+    if title_match:
			
 
				+        title_content = title_match.group(1)
			
 
				+        title_content = title_content.split(" - ")[0]
			
 
				+        title_content = bytes(title_content, "latin1").decode()
			
 
				+    else:
			
 
				+        title_content = ""
			
 
				+    # video_url
			
 
				+    main_url = re.search(r'("main_url":")(.*?)"', text)[0]
			
 
				+    main_url = main_url.split(":")[1]
			
 
				+    decoded_data = base64.b64decode(main_url)
			
 
				+    try:
			
 
				+        # 尝试使用utf-8解码
			
 
				+        video_url = decoded_data.decode()
			
 
				+    except UnicodeDecodeError:
			
 
				+        # 如果utf-8解码失败，尝试使用其他编码方式
			
 
				+        video_url = decoded_data.decode('latin-1')
			
 
				+
			
 
				+    # video_id
			
 
				+    video_id = re.search(r'"vid":"(.*?)"', text).group(1)
			
 
				+
			
 
				+    # like_count
			
 
				+    like_count = re.search(r'"video_like_count":"(.*?)"', text).group(1)
			
 
				+
			
 
				+    # cover_url
			
 
				+    cover_url = re.search(r'"avatar_url":"(.*?)"', text).group(1)
			
 
				+
			
 
				+    # video_play
			
 
				+    video_watch_count = re.search(r'"video_watch_count":"(.*?)"', text).group(1)
			
 
				+
			
 
				+    # "video_publish_time"
			
 
				+    publish_time = re.search(r'"video_publish_time":"(.*?)"', text).group(1)
			
 
				+
			
 
				+    # video_duration
			
 
				+    duration = re.search(r'("video_duration":)(.*?)"', text).group(2).replace(",", "")
			
 
				+    return {
			
 
				+        "title": title_content,
			
 
				+        "url": video_url,
			
 
				+        "video_id": video_id,
			
 
				+        "like_count": like_count,
			
 
				+        "cover_url": cover_url,
			
 
				+        "play_count": video_watch_count,
			
 
				+        "publish_time": publish_time,
			
 
				+        "duration": duration
			
 
				+    }
			
 
				+
			
 
				 def random_signature():
			
 
				     """
			
 
				     随机生成签名
			
@@ -640,6 +694,7 @@ class XiGuaAuthor:
 
				     """
			
 
				     西瓜账号爬虫
			
 
				     """
			
 
				+
			
 
				     def __init__(self, platform, mode, rule_dict, env, user_list):
			
 
				         self.platform = platform
			
 
				         self.mode = mode
			
@@ -656,30 +711,33 @@ class XiGuaAuthor:
 
				         :param account: 输入的账号信息
			
 
				         {'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}}
			
 
				         """
			
 
				-        flag = account['link'].split("_")[0]
			
 
				-        if flag == "V1":
			
 
				-            rule_dict = {
			
 
				-                "play_cnt": {"min": 100000, "max": 0},
			
 
				-                'period': {"min": 90, "max": 90},
			
 
				-                'special': 0.02
			
 
				-            }
			
 
				-            return rule_dict
			
 
				-        elif flag == "V2":
			
 
				-            rule_dict = {
			
 
				-                "play_cnt": {"min": 10000, "max": 0},
			
 
				-                'period': {"min": 90, "max": 90},
			
 
				-                'special': 0.01
			
 
				-            }
			
 
				-            return rule_dict
			
 
				-        elif flag == "V3":
			
 
				-            rule_dict = {
			
 
				-                "play_cnt": {"min": 5000, "max": 0},
			
 
				-                'period': {"min": 90, "max": 90},
			
 
				-                'special': 0.01
			
 
				-            }
			
 
				-            return rule_dict
			
 
				-        else:
			
 
				+        temp = account['link'].split("_")
			
 
				+        if len(temp) == 1:
			
 
				             return self.rule_dict
			
 
				+        else:
			
 
				+            flag = temp[-2]
			
 
				+            match flag:
			
 
				+                case "V1":
			
 
				+                    rule_dict = {
			
 
				+                        "play_cnt": {"min": 100000, "max": 0},
			
 
				+                        'period': {"min": 90, "max": 90},
			
 
				+                        'special': 0.02
			
 
				+                    }
			
 
				+                    return rule_dict
			
 
				+                case "V2":
			
 
				+                    rule_dict = {
			
 
				+                        "play_cnt": {"min": 10000, "max": 0},
			
 
				+                        'period': {"min": 90, "max": 90},
			
 
				+                        'special': 0.01
			
 
				+                    }
			
 
				+                    return rule_dict
			
 
				+                case "V3":
			
 
				+                    rule_dict = {
			
 
				+                        "play_cnt": {"min": 5000, "max": 0},
			
 
				+                        'period': {"min": 90, "max": 90},
			
 
				+                        'special': 0.01
			
 
				+                    }
			
 
				+                    return rule_dict
			
 
				 
			
 
				     def get_author_list(self):
			
 
				         """
			
@@ -690,7 +748,19 @@ class XiGuaAuthor:
 
				         for user_dict in self.user_list:
			
 
				             # if self.download_count <= max_count:
			
 
				             try:
			
 
				-                self.get_video_list(user_dict)
			
 
				+                flag = user_dict["link"][0]
			
 
				+                match flag:
			
 
				+                    case "V":
			
 
				+                        self.get_video_list(user_dict)
			
 
				+                    case "X":
			
 
				+                        self.get_tiny_video_list(user_dict)
			
 
				+                    case "h":
			
 
				+                        self.get_video_list(user_dict)
			
 
				+                    case "D":
			
 
				+                        self.get_video_list(user_dict)
			
 
				+                    case "B":
			
 
				+                        self.get_video_list(user_dict)
			
 
				+                        self.get_tiny_video_list(user_dict)
			
 
				             except Exception as e:
			
 
				                 AliyunLogger.logging(
			
 
				                     code="3001",
			
@@ -714,15 +784,13 @@ class XiGuaAuthor:
 
				     def get_video_list(self, user_dict):
			
 
				         """
			
 
				         获取某个账号的视频列表
			
 
				+        账号分为 3 类
			
 
				         """
			
 
				         offset = 0
			
 
				         signature = random_signature()
			
 
				+        link = user_dict['link'].split("_")[-1]
			
 
				         url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
			
 
				         while True:
			
 
				-            if user_dict['link'][0] == "V":
			
 
				-                link = user_dict["link"][3:]
			
 
				-            else:
			
 
				-                link = user_dict["link"]
			
 
				             to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
			
 
				             params = {
			
 
				                 "to_user_id": to_user_id,
			
@@ -731,8 +799,6 @@ class XiGuaAuthor:
 
				                 "maxBehotTime": "0",
			
 
				                 "order": "new",
			
 
				                 "isHome": "0",
			
 
				-                # 'msToken': 'G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==',
			
 
				-                # 'X-Bogus': 'DFSzswVuEkUANjW9ShFTgR/F6qHt',
			
 
				                 "_signature": signature,
			
 
				             }
			
 
				             headers = {
			
@@ -749,7 +815,7 @@ class XiGuaAuthor:
 
				             offset += 30
			
 
				             if "data" not in response.text or response.status_code != 200:
			
 
				                 AliyunLogger.logging(
			
 
				-                    code="2000",
			
 
				+                    code="3000",
			
 
				                     platform=self.platform,
			
 
				                     mode=self.mode,
			
 
				                     env=self.env,
			
@@ -758,7 +824,7 @@ class XiGuaAuthor:
 
				                 return
			
 
				             elif not response.json()["data"]["videoList"]:
			
 
				                 AliyunLogger.logging(
			
 
				-                    code="2000",
			
 
				+                    code="3000",
			
 
				                     platform=self.platform,
			
 
				                     mode=self.mode,
			
 
				                     env=self.env,
			
@@ -778,7 +844,7 @@ class XiGuaAuthor:
 
				                             data=video_obj,
			
 
				                             message="扫描到一条视频",
			
 
				                         )
			
 
				-                        date_flag = self.process_video_obj(video_obj, user_dict)
			
 
				+                        date_flag = self.process_video_obj(video_obj, user_dict, "l")
			
 
				                         if not date_flag:
			
 
				                             return
			
 
				                     except Exception as e:
			
@@ -791,10 +857,86 @@ class XiGuaAuthor:
 
				                             message="抓取单条视频异常, 报错原因是: {}".format(e),
			
 
				                         )
			
 
				 
			
 
				-    def process_video_obj(self, video_obj, user_dict):
			
 
    def get_tiny_video_list(self, user_dict):
        """
        Page through an account's short ("tiny") videos and process each one.

        Requests the hotsoon video endpoint repeatedly, feeding the
        ``max_behot_time`` of the last item of each page back in as the cursor
        for the next request, and hands every item to ``process_video_obj``
        with the flag ``"s"``.

        Pagination stops when the response is unusable, when the returned list
        is empty, or when ``process_video_obj`` returns a falsy value.

        :param user_dict: account record; reads ``link`` (the account id is
            whatever follows the last "_", with the home-page URL prefix
            removed) and ``uid`` (used for logging)
        :return: None
        """
        url = "https://www.ixigua.com/api/videov2/hotsoon/video"
        max_behot_time = "0"
        link = user_dict['link'].split("_")[-1]
        to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
        # The headers depend only on the account, so build them once.
        headers = {
            "referer": "https://www.ixigua.com/{}?&".format(to_user_id),
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
        }
        while True:
            params = {
                "to_user_id": to_user_id,
                "max_behot_time": max_behot_time,
                # a fresh signature is generated for every page request
                "_signature": random_signature()
            }
            response = requests.get(
                url=url,
                headers=headers,
                params=params,
                proxies=tunnel_proxies(),
                timeout=5,
            )

            # Parse the body once. A body that is not JSON, or whose "data"
            # is not an object, is treated like any other failed response
            # instead of raising out of the pagination loop.
            payload = None
            if response.status_code == 200 and "data" in response.text:
                try:
                    payload = response.json()
                except ValueError:
                    payload = None
            data = payload.get("data") if isinstance(payload, dict) else None

            if not isinstance(data, dict):
                # NOTE(review): get_video_list logs these same two conditions
                # with code "3000"; confirm whether "2000" is intended here.
                AliyunLogger.logging(
                    code="2000",
                    platform=self.platform,
                    mode=self.mode,
                    env=self.env,
                    message=f"get_videoList:{response.text}\n",
                )
                return

            video_list = data.get("data")
            if not video_list:
                AliyunLogger.logging(
                    code="2000",
                    platform=self.platform,
                    mode=self.mode,
                    env=self.env,
                    message="没有更多数据啦~\n",
                )
                return

            # cursor for the next page comes from the last item of this page
            max_behot_time = video_list[-1]["max_behot_time"]
            for video_obj in video_list:
                try:
                    AliyunLogger.logging(
                        code="1001",
                        account=user_dict['uid'],
                        platform=self.platform,
                        mode=self.mode,
                        env=self.env,
                        data=video_obj,
                        message="扫描到一条小视频",
                    )
                    date_flag = self.process_video_obj(video_obj, user_dict, "s")
                    if not date_flag:
                        return
                except Exception as e:
                    # One bad item must not abort the rest of the page.
                    AliyunLogger.logging(
                        code="3000",
                        platform=self.platform,
                        mode=self.mode,
                        env=self.env,
                        data=video_obj,
                        message="抓取单条视频异常, 报错原因是: {}".format(e),
                    )
			
 
				+
			
 
				+    def process_video_obj(self, video_obj, user_dict, f):
			
 
				+        """
			
 
				+        process video_obj and extract video_url
			
 
				+        """
			
 
				         new_rule = self.rule_maker(user_dict)
			
 
				         trace_id = self.platform + str(uuid.uuid1())
			
 
				-        item_id = video_obj.get("item_id", "")
			
 
				+        if f == "s":
			
 
				+            item_id = video_obj.get("id_str", "")
			
 
				+        else:
			
 
				+            item_id = video_obj.get("item_id", "")
			
 
				         if not item_id:
			
 
				             AliyunLogger.logging(
			
 
				                 code="2005",
			
@@ -905,44 +1047,19 @@ class XiGuaAuthor:
 
				             return True
			
 
				 
			
 
				     def get_video_info(self, item_id, trace_id):
			
 
				-        url = "https://www.ixigua.com/api/mixVideo/information?"
			
 
				+        """
			
 
				+        获取视频信息
			
 
				+        """
			
 
				+        url = "https://www.ixigua.com/{}".format(item_id)
			
 
				         headers = {
			
 
				             "accept-encoding": "gzip, deflate",
			
 
				             "accept-language": "zh-CN,zh-Hans;q=0.9",
			
 
				             "user-agent": FakeUserAgent().random,
			
 
				-            "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
			
 
				-        }
			
 
				-        params = {
			
 
				-            "mixId": str(item_id),
			
 
				-            "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC"
			
 
				-                       "NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
			
 
				-            "X-Bogus": "DFSzswVupYTANCJOSBk0P53WxM-r",
			
 
				-            "_signature": "_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px"
			
 
				-                          "fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94",
			
 
				-        }
			
 
				-        cookies = {
			
 
				-            "ixigua-a-s": "1",
			
 
				-            "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB"
			
 
				-                       "NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
			
 
				-            "ttwid": "1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7"
			
 
				-                     "6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8",
			
 
				-            "tt_scid": "QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3",
			
 
				-            "MONITOR_WEB_ID": "0a49204a-7af5-4e96-95f0-f4bafb7450ad",
			
 
				-            "__ac_nonce": "06304878000964fdad287",
			
 
				-            "__ac_signature": "_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb"
			
 
				-                              "FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8",
			
 
				-            "ttcid": "e56fabf6e85d4adf9e4d91902496a0e882",
			
 
				-            "_tea_utm_cache_1300": "undefined",
			
 
				-            "support_avif": "false",
			
 
				-            "support_webp": "false",
			
 
				-            "xiguavideopcwebid": "7134967546256016900",
			
 
				-            "xiguavideopcwebid.sig": "xxRww5R1VEMJN_dQepHorEu_eAc",
			
 
				+            "referer": "https://www.ixigua.com/{}/".format(item_id),
			
 
				         }
			
 
				         response = requests.get(
			
 
				             url=url,
			
 
				             headers=headers,
			
 
				-            params=params,
			
 
				-            cookies=cookies,
			
 
				             proxies=tunnel_proxies(),
			
 
				             timeout=5,
			
 
				         )
			
@@ -961,41 +1078,29 @@ class XiGuaAuthor:
 
				             )
			
 
				             return None
			
 
				         else:
			
 
				-            video_info = (
			
 
				-                response.json()["data"]
			
 
				-                .get("gidInformation", {})
			
 
				-                .get("packerData", {})
			
 
				-                .get("video", {})
			
 
				-            )
			
 
				-            if video_info == {}:
			
 
				-                return None
			
 
				-            video_detail = get_video_url(video_info)
			
 
				-
			
 
				+            video_info = extract_info_by_re(response.text)
			
 
				             video_dict = {
			
 
				                 "video_title": video_info.get("title", ""),
			
 
				-                "video_id": video_info.get("videoResource", {}).get("vid", ""),
			
 
				+                "video_id": video_info.get("video_id"),
			
 
				                 "gid": str(item_id),
			
 
				-                "play_cnt": int(video_info.get("video_watch_count", 0)),
			
 
				-                "like_cnt": int(video_info.get("video_like_count", 0)),
			
 
				-                "comment_cnt": int(get_comment_cnt(item_id)),
			
 
				+                "play_cnt": int(video_info.get("play_count", 0)),
			
 
				+                "like_cnt": int(video_info.get("like_count", 0)),
			
 
				+                "comment_cnt": 0,
			
 
				                 "share_cnt": 0,
			
 
				                 "favorite_cnt": 0,
			
 
				-                "duration": int(video_info.get("video_duration", 0)),
			
 
				-                "video_width": int(video_detail["video_width"]),
			
 
				-                "video_height": int(video_detail["video_height"]),
			
 
				-                "publish_time_stamp": int(video_info.get("video_publish_time", 0)),
			
 
				+                "duration": int(video_info.get("duration", 0)),
			
 
				+                "video_width": 0,
			
 
				+                "video_height": 0,
			
 
				+                "publish_time_stamp": int(video_info.get("publish_time", 0)),
			
 
				                 "publish_time_str": time.strftime(
			
 
				                     "%Y-%m-%d %H:%M:%S",
			
 
				-                    time.localtime(int(video_info.get("video_publish_time", 0))),
			
 
				+                    time.localtime(int(video_info.get("publish_time", 0))),
			
 
				                 ),
			
 
				-                "user_name": video_info.get("user_info", {}).get("name", ""),
			
 
				-                "user_id": str(video_info.get("user_info", {}).get("user_id", "")),
			
 
				                 "avatar_url": str(
			
 
				                     video_info.get("user_info", {}).get("avatar_url", "")
			
 
				                 ),
			
 
				-                "cover_url": video_info.get("poster_url", ""),
			
 
				-                "audio_url": video_detail["audio_url"],
			
 
				-                "video_url": video_detail["video_url"],
			
 
				+                "cover_url": video_info.get("cover_url", ""),
			
 
				+                "video_url": video_info.get("url"),
			
 
				                 "session": f"xigua-search-{int(time.time())}",
			
 
				             }
			
 
				             return video_dict