Explorar el Código

重新上线 limit.py

上线 manage_accounts.py, 用于处理长沙同学账号
罗俊辉 hace 1 año
padre
commit
cdaeb4de69
Se ha modificado 1 fichero con 194 adiciones y 89 borrados
  1. 194 89
      xigua/xigua_author/xigua_author.py

+ 194 - 89
xigua/xigua_author/xigua_author.py

@@ -1,5 +1,6 @@
 import json
 import os
+import re
 import random
 import sys
 import string
@@ -17,6 +18,59 @@ from common import AliyunLogger, PiaoQuanPipeline, tunnel_proxies
 from common.limit import AuthorLimit
 
 
+def extract_info_by_re(text):
+    """
+    Extract video metadata from a video detail page using regular expressions.
+
+    The title is optional ("" when there is no <title> tag); every other
+    field is required.
+
+    :param text: page source (HTML with the embedded JSON fields)
+    :return: dict with the keys title, url, video_id, like_count, cover_url,
+             play_count, publish_time and duration; every value is a str
+    :raises ValueError: when a required field is not present in ``text``
+    """
+    import base64  # local import so the function does not rely on the module header
+
+    def required(pattern, field):
+        # Fail with a message naming the field instead of an opaque NoneType error.
+        found = re.search(pattern, text)
+        if found is None:
+            raise ValueError("field not found in page: {}".format(field))
+        return found.group(1)
+
+    # Title: keep the part before " - ", then undo latin-1/UTF-8 mojibake.
+    title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
+    if title_match:
+        title_content = title_match.group(1).split(" - ")[0]
+        try:
+            title_content = title_content.encode("latin1").decode()
+        except (UnicodeEncodeError, UnicodeDecodeError):
+            pass  # text was not mojibake (already decoded correctly); keep it
+    else:
+        title_content = ""
+
+    # video_url: "main_url" holds the base64-encoded address.
+    decoded_data = base64.b64decode(required(r'"main_url":"(.*?)"', "main_url"))
+    try:
+        video_url = decoded_data.decode()
+    except UnicodeDecodeError:
+        # Not valid UTF-8: latin-1 maps every byte, so this fallback cannot fail.
+        video_url = decoded_data.decode('latin-1')
+
+    return {
+        "title": title_content,
+        "url": video_url,
+        "video_id": required(r'"vid":"(.*?)"', "vid"),
+        "like_count": required(r'"video_like_count":"(.*?)"', "video_like_count"),
+        # NOTE(review): read from "avatar_url" - confirm it is not the uploader's avatar.
+        "cover_url": required(r'"avatar_url":"(.*?)"', "avatar_url"),
+        "play_count": required(r'"video_watch_count":"(.*?)"', "video_watch_count"),
+        "publish_time": required(r'"video_publish_time":"(.*?)"', "video_publish_time"),
+        # Accept both a bare and a quoted number; digits only, so int() works on it.
+        "duration": required(r'"video_duration":\s*"?(\d+)', "video_duration"),
+    }
+
 def random_signature():
     """
     随机生成签名
@@ -640,6 +694,7 @@ class XiGuaAuthor:
     """
     西瓜账号爬虫
     """
+
     def __init__(self, platform, mode, rule_dict, env, user_list):
         self.platform = platform
         self.mode = mode
@@ -656,30 +711,33 @@ class XiGuaAuthor:
         :param account: 输入的账号信息
         {'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}}
         """
-        flag = account['link'].split("_")[0]
-        if flag == "V1":
-            rule_dict = {
-                "play_cnt": {"min": 100000, "max": 0},
-                'period': {"min": 90, "max": 90},
-                'special': 0.02
-            }
-            return rule_dict
-        elif flag == "V2":
-            rule_dict = {
-                "play_cnt": {"min": 10000, "max": 0},
-                'period': {"min": 90, "max": 90},
-                'special': 0.01
-            }
-            return rule_dict
-        elif flag == "V3":
-            rule_dict = {
-                "play_cnt": {"min": 5000, "max": 0},
-                'period': {"min": 90, "max": 90},
-                'special': 0.01
-            }
-            return rule_dict
-        else:
+        temp = account['link'].split("_")
+        if len(temp) == 1:
             return self.rule_dict
+        else:
+            flag = temp[-2]
+            match flag:
+                case "V1":
+                    rule_dict = {
+                        "play_cnt": {"min": 100000, "max": 0},
+                        'period': {"min": 90, "max": 90},
+                        'special': 0.02
+                    }
+                    return rule_dict
+                case "V2":
+                    rule_dict = {
+                        "play_cnt": {"min": 10000, "max": 0},
+                        'period': {"min": 90, "max": 90},
+                        'special': 0.01
+                    }
+                    return rule_dict
+                case "V3":
+                    rule_dict = {
+                        "play_cnt": {"min": 5000, "max": 0},
+                        'period': {"min": 90, "max": 90},
+                        'special': 0.01
+                    }
+                    return rule_dict
 
     def get_author_list(self):
         """
@@ -690,7 +748,19 @@ class XiGuaAuthor:
         for user_dict in self.user_list:
             # if self.download_count <= max_count:
             try:
-                self.get_video_list(user_dict)
+                flag = user_dict["link"][0]
+                match flag:
+                    case "V":
+                        self.get_video_list(user_dict)
+                    case "X":
+                        self.get_tiny_video_list(user_dict)
+                    case "h":
+                        self.get_video_list(user_dict)
+                    case "D":
+                        self.get_video_list(user_dict)
+                    case "B":
+                        self.get_video_list(user_dict)
+                        self.get_tiny_video_list(user_dict)
             except Exception as e:
                 AliyunLogger.logging(
                     code="3001",
@@ -714,15 +784,13 @@ class XiGuaAuthor:
     def get_video_list(self, user_dict):
         """
         获取某个账号的视频列表
+        账号分为 3 类
         """
         offset = 0
         signature = random_signature()
+        link = user_dict['link'].split("_")[-1]
         url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
         while True:
-            if user_dict['link'][0] == "V":
-                link = user_dict["link"][3:]
-            else:
-                link = user_dict["link"]
             to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
             params = {
                 "to_user_id": to_user_id,
@@ -731,8 +799,6 @@ class XiGuaAuthor:
                 "maxBehotTime": "0",
                 "order": "new",
                 "isHome": "0",
-                # 'msToken': 'G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==',
-                # 'X-Bogus': 'DFSzswVuEkUANjW9ShFTgR/F6qHt',
                 "_signature": signature,
             }
             headers = {
@@ -749,7 +815,7 @@ class XiGuaAuthor:
             offset += 30
             if "data" not in response.text or response.status_code != 200:
                 AliyunLogger.logging(
-                    code="2000",
+                    code="3000",
                     platform=self.platform,
                     mode=self.mode,
                     env=self.env,
@@ -758,7 +824,7 @@ class XiGuaAuthor:
                 return
             elif not response.json()["data"]["videoList"]:
                 AliyunLogger.logging(
-                    code="2000",
+                    code="3000",
                     platform=self.platform,
                     mode=self.mode,
                     env=self.env,
@@ -778,7 +844,7 @@ class XiGuaAuthor:
                             data=video_obj,
                             message="扫描到一条视频",
                         )
-                        date_flag = self.process_video_obj(video_obj, user_dict)
+                        date_flag = self.process_video_obj(video_obj, user_dict, "l")
                         if not date_flag:
                             return
                     except Exception as e:
@@ -791,10 +857,86 @@ class XiGuaAuthor:
                             message="抓取单条视频异常, 报错原因是: {}".format(e),
                         )
 
-    def process_video_obj(self, video_obj, user_dict):
+    def get_tiny_video_list(self, user_dict):
+        """
+        Crawl the short-video ("hotsoon") feed of one account, page by page.
+
+        Every video object is logged and handed to process_video_obj with the
+        "s" flag. Paging stops when the request fails, when a page is empty,
+        or when process_video_obj returns a falsy value.
+
+        :param user_dict: account record; its 'link' and 'uid' entries are read
+        """
+        api = "https://www.ixigua.com/api/videov2/hotsoon/video"
+        # The user id is the text after the last "_" of the link, minus the home URL.
+        home_link = user_dict['link'].split("_")[-1]
+        to_user_id = str(home_link.replace("https://www.ixigua.com/home/", ""))
+        request_headers = {
+            "referer": "https://www.ixigua.com/{}?&".format(to_user_id),
+            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
+        }
+        cursor = "0"
+        while True:
+            response = requests.get(
+                url=api,
+                headers=request_headers,
+                params={
+                    "to_user_id": to_user_id,
+                    "max_behot_time": cursor,
+                    "_signature": random_signature(),
+                },
+                proxies=tunnel_proxies(),
+                timeout=5,
+            )
+            if response.status_code != 200 or "data" not in response.text:
+                AliyunLogger.logging(
+                    code="2000", platform=self.platform, mode=self.mode,
+                    env=self.env, message=f"get_videoList:{response.text}\n",
+                )
+                return
+            video_list = response.json()["data"]["data"]
+            if not video_list:
+                AliyunLogger.logging(
+                    code="2000", platform=self.platform, mode=self.mode,
+                    env=self.env, message="没有更多数据啦~\n",
+                )
+                return
+
+            # The last item of the page carries the cursor for the next request.
+            cursor = video_list[-1]["max_behot_time"]
+            for video_obj in video_list:
+                try:
+                    AliyunLogger.logging(
+                        code="1001",
+                        account=user_dict['uid'],
+                        platform=self.platform,
+                        mode=self.mode,
+                        env=self.env,
+                        data=video_obj,
+                        message="扫描到一条小视频",
+                    )
+                    if not self.process_video_obj(video_obj, user_dict, "s"):
+                        return
+                except Exception as e:
+                    AliyunLogger.logging(
+                        code="3000",
+                        platform=self.platform,
+                        mode=self.mode,
+                        env=self.env,
+                        data=video_obj,
+                        message="抓取单条视频异常, 报错原因是: {}".format(e),
+                    )
+
+    def process_video_obj(self, video_obj, user_dict, f):
+        """
+        process video_obj and extract video_url
+        """
         new_rule = self.rule_maker(user_dict)
         trace_id = self.platform + str(uuid.uuid1())
-        item_id = video_obj.get("item_id", "")
+        if f == "s":
+            item_id = video_obj.get("id_str", "")
+        else:
+            item_id = video_obj.get("item_id", "")
         if not item_id:
             AliyunLogger.logging(
                 code="2005",
@@ -905,44 +1047,19 @@ class XiGuaAuthor:
             return True
 
     def get_video_info(self, item_id, trace_id):
-        url = "https://www.ixigua.com/api/mixVideo/information?"
+        """
+        获取视频信息
+        """
+        url = "https://www.ixigua.com/{}".format(item_id)
         headers = {
             "accept-encoding": "gzip, deflate",
             "accept-language": "zh-CN,zh-Hans;q=0.9",
             "user-agent": FakeUserAgent().random,
-            "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
-        }
-        params = {
-            "mixId": str(item_id),
-            "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC"
-                       "NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
-            "X-Bogus": "DFSzswVupYTANCJOSBk0P53WxM-r",
-            "_signature": "_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px"
-                          "fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94",
-        }
-        cookies = {
-            "ixigua-a-s": "1",
-            "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB"
-                       "NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
-            "ttwid": "1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7"
-                     "6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8",
-            "tt_scid": "QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3",
-            "MONITOR_WEB_ID": "0a49204a-7af5-4e96-95f0-f4bafb7450ad",
-            "__ac_nonce": "06304878000964fdad287",
-            "__ac_signature": "_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb"
-                              "FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8",
-            "ttcid": "e56fabf6e85d4adf9e4d91902496a0e882",
-            "_tea_utm_cache_1300": "undefined",
-            "support_avif": "false",
-            "support_webp": "false",
-            "xiguavideopcwebid": "7134967546256016900",
-            "xiguavideopcwebid.sig": "xxRww5R1VEMJN_dQepHorEu_eAc",
+            "referer": "https://www.ixigua.com/{}/".format(item_id),
         }
         response = requests.get(
             url=url,
             headers=headers,
-            params=params,
-            cookies=cookies,
             proxies=tunnel_proxies(),
             timeout=5,
         )
@@ -961,41 +1078,29 @@ class XiGuaAuthor:
             )
             return None
         else:
-            video_info = (
-                response.json()["data"]
-                .get("gidInformation", {})
-                .get("packerData", {})
-                .get("video", {})
-            )
-            if video_info == {}:
-                return None
-            video_detail = get_video_url(video_info)
-
+            video_info = extract_info_by_re(response.text)
             video_dict = {
                 "video_title": video_info.get("title", ""),
-                "video_id": video_info.get("videoResource", {}).get("vid", ""),
+                "video_id": video_info.get("video_id"),
                 "gid": str(item_id),
-                "play_cnt": int(video_info.get("video_watch_count", 0)),
-                "like_cnt": int(video_info.get("video_like_count", 0)),
-                "comment_cnt": int(get_comment_cnt(item_id)),
+                "play_cnt": int(video_info.get("play_count", 0)),
+                "like_cnt": int(video_info.get("like_count", 0)),
+                "comment_cnt": 0,
                 "share_cnt": 0,
                 "favorite_cnt": 0,
-                "duration": int(video_info.get("video_duration", 0)),
-                "video_width": int(video_detail["video_width"]),
-                "video_height": int(video_detail["video_height"]),
-                "publish_time_stamp": int(video_info.get("video_publish_time", 0)),
+                "duration": int(video_info.get("duration", 0)),
+                "video_width": 0,
+                "video_height": 0,
+                "publish_time_stamp": int(video_info.get("publish_time", 0)),
                 "publish_time_str": time.strftime(
                     "%Y-%m-%d %H:%M:%S",
-                    time.localtime(int(video_info.get("video_publish_time", 0))),
+                    time.localtime(int(video_info.get("publish_time", 0))),
                 ),
-                "user_name": video_info.get("user_info", {}).get("name", ""),
-                "user_id": str(video_info.get("user_info", {}).get("user_id", "")),
                 "avatar_url": str(
                     video_info.get("user_info", {}).get("avatar_url", "")
                 ),
-                "cover_url": video_info.get("poster_url", ""),
-                "audio_url": video_detail["audio_url"],
-                "video_url": video_detail["video_url"],
+                "cover_url": video_info.get("cover_url", ""),
+                "video_url": video_info.get("url"),
                 "session": f"xigua-search-{int(time.time())}",
             }
             return video_dict