2 tahun lalu · 7de43f0174
--- a/changsha_bot.py
+++ b/changsha_bot.py
@@ -6,10 +6,28 @@ import datetime
 
				 import time
			
 
				 
			
 
				 import requests
			
 
				+import multiprocessing
			
 
				 
			
 
				 from common.db import RedisClient
			
 
				 
			
 
				 
			
 
				+def protect_spider_timeout(function):
			
 
				+    """
			
 
				+    守护进程，在程序启动后的某一个时段内守护爬虫进程
			
 
				+    :param function: 被守护的函数
			
 
				+    :param hour: 守护时长 / hour
			
 
				+    """
			
 
				+    process = multiprocessing.Process(target=function)
			
 
				+    process.start()
			
 
				+    while True:
			
 
				+        if not process.is_alive():
			
 
				+            process.terminate()
			
 
				+            time.sleep(60)
			
 
				+            process = multiprocessing.Process(target=function)
			
 
				+            process.start()
			
 
				+        time.sleep(60)
			
 
				+
			
 
				+
			
 
				 def bot(name):
			
 
				     """
			
 
				     报警机器人
			
@@ -55,12 +73,6 @@ def monitor():
 
				     """
			
 
				     监测 redis 中数据
			
 
				     """
			
 
				-    counts_info = {
			
 
				-        "罗情": True,
			
 
				-        "余海涛": True,
			
 
				-        "范军": True,
			
 
				-        "鲁涛": True
			
 
				-    }
			
 
				     keys = {"352": "余海涛", "353": "罗情", "53": "范军", "51": "鲁涛"}
			
 
				     now = datetime.datetime.now().time()
			
 
				     start_alert_time = datetime.time(10)
			
@@ -75,14 +87,32 @@ def monitor():
 
				                         if count:
			
 
				                             OO = int(count.decode("utf-8"))
			
 
				                             name = keys[key]
			
 
				-                            if OO >= 300 and counts_info[name]:
			
 
				-                                bot(name)
			
 
				-                                counts_info[name] = False
			
 
				+                            redis_date_key = key + "-" + datetime.date.today().strftime("%Y%m%d")
			
 
				+                            if R.select(redis_date_key):
			
 
				+                                # 说明已经存储进去了， 不需要再报警了
			
 
				+                                continue
			
 
				+                            else:
			
 
				+                                if OO > 300:
			
 
				+                                    R.insert(redis_date_key, "already bot", 86400)
			
 
				+                                    # print("超过了， 报警", name)
			
 
				+                                    bot(name)
			
 
				+                                else:
			
 
				+                                    continue
			
 
				             except Exception as e:
			
 
				+                print(e)
			
 
				                 pass
			
 
				             # 查询一次之后等待 60 s
			
 
				         time.sleep(60)
			
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				-    monitor()
			
 
				+    protect_spider_timeout(monitor)
			
 
				+    # monitor()
			
 
				+    # R = RedisClient()
			
 
				+    # R.connect()
			
 
				+    # # w = R.select("53-20240327")
			
 
				+    # # print(w)
			
 
				+    # R.delete("51-20240327")
			
 
				+    # R.delete("53-20240327")
			
 
				+    # R.delete("353-20240327")
			
 
				+    # R.delete("352-20240327")
			
--- a/common/limit.py
+++ b/common/limit.py
@@ -31,43 +31,41 @@ class AuthorLimit(object):
 
				     def __init__(self, mode, platform):
			
 
				         self.mode = mode
			
 
				         self.platform = platform
			
 
				-        self.limit_tag_dict = {"352": "余海涛", "353": "罗情", "53": "范军", "51": "鲁涛", "131": False}
			
 
				+        self.limit_tag_dict = {
			
 
				+            "余海涛": "352",
			
 
				+            "罗情": "353",
			
 
				+            "范军": "53",
			
 
				+            "鲁涛": "51"
			
 
				+        }
			
 
				 
			
 
				     def find_tag(self, uid):
			
 
				         """
			
 
				-        通过 uid 去找符合标准的 tag
			
 
				+        判断 uid 是否存在changsha_user_accounts中
			
 
				         """
			
 
				-        sql = f"""select tag from crawler_user_v3 where uid={uid};"""
			
 
				+        sql = f"""select user_name from changsha_user_accounts where piaoquan_account_id = {uid};"""
			
 
				         result = MysqlHelper.get_values(
			
 
				             log_type=self.mode, crawler=self.platform, env="prod", sql=sql
			
 
				         )
			
 
				-        tags = result[0]["tag"]
			
 
				-        if tags:
			
 
				-            tags = tags.split(",")
			
 
				-            if "131" in tags:
			
 
				-                return None
			
 
				-            else:
			
 
				-                for tag in tags:
			
 
				-                    if self.limit_tag_dict.get(tag):
			
 
				-                        return tag
			
 
				-        return None
			
 
				+        return result
			
 
				 
			
 
				     def author_limitation(self, user_id):
			
 
				         """
			
 
				         限制账号， 服务长沙四名同学
			
 
				         """
			
 
				         if self.mode == "author":
			
 
				-            tag = self.find_tag(user_id)
			
 
				-            if tag:
			
 
				+            result = self.find_tag(user_id)
			
 
				+            if result:
			
 
				+                user_name = result[0]['user_name']
			
 
				                 AliyunLogger.logging(
			
 
				                     code="8807",
			
 
				                     platform=self.platform,
			
 
				                     mode=self.mode,
			
 
				                     env="prod",
			
 
				-                    message="找到个人账号，{}".format(tag)
			
 
				+                    message="找到个人账号，{}".format(user_name)
			
 
				                 )
			
 
				                 R = RedisClient()
			
 
				                 if R.connect():
			
 
				+                    tag = self.limit_tag_dict[user_name]
			
 
				                     tag_count = R.select(tag)
			
 
				                     if tag_count:
			
 
				                         tag_count = int(tag_count.decode("utf-8"))
			
--- a/manage_accounts.py
+++ b/manage_accounts.py
@@ -0,0 +1,115 @@
 
				+import time
			
 
				+import schedule
			
 
				+import multiprocessing
			
 
				+from common.scheduling_db import MysqlHelper
			
 
				+from common.aliyun_log import AliyunLogger
			
 
				+
			
 
				+
			
 
				+def read_accounts_from_mysql():
			
 
				+    """
			
 
				+    Read accounts from mysql database
			
 
				+    """
			
 
				+    sql = f"""select tag, uid from crawler_user_v3 order by create_time desc;"""
			
 
				+    result = MysqlHelper.get_values(
			
 
				+        log_type="author", crawler="changsha", env="prod", sql=sql
			
 
				+    )
			
 
				+    limit_tag_dict = {
			
 
				+        "352": "余海涛",
			
 
				+        "353": "罗情",
			
 
				+        "53": "范军",
			
 
				+        "51": "鲁涛",
			
 
				+        "131": "王雪珂",
			
 
				+        "6682": "公众新号",
			
 
				+        "469": "小年糕",
			
 
				+        "464": "快手",
			
 
				+        "5662": "快手账号爬虫",
			
 
				+        "459": "spider",
			
 
				+        "85": "快手爬虫",
			
 
				+        "454": "账号",
			
 
				+        "467": "视频号",
			
 
				+        "106": "⭐️小年糕爬虫",
			
 
				+        "120": "西瓜新爬虫",
			
 
				+        "499": "抖音",
			
 
				+        "2235": "抖音爬虫"
			
 
				+    }
			
 
				+    p_dict = {}
			
 
				+    for item in result:
			
 
				+        tag_list = item['tag'].split(",")
			
 
				+        tag_set = set(tag_list)
			
 
				+        require_set = {'454', '459'}
			
 
				+        forbidden_set = {'131', '465', '1379', '160'}
			
 
				+        if len(tag_set) >= 5:
			
 
				+            if require_set.issubset(tag_set) and forbidden_set.isdisjoint(tag_set):
			
 
				+                w = [limit_tag_dict.get(tag, None) for tag in tag_list]
			
 
				+                p_dict[item['uid']] = w
			
 
				+    return p_dict
			
 
				+
			
 
				+
			
 
				+def insert_accounts(account_dict):
			
 
				+    """
			
 
				+    把长沙同学账号插入到 changsha_accounts 中
			
 
				+    """
			
 
				+    for key in account_dict:
			
 
				+        select_sql = f"""select id from changsha_user_accounts where piaoquan_account_id = {key};"""
			
 
				+        result = MysqlHelper.get_values(
			
 
				+            log_type="author", crawler="changsha", env="prod", sql=select_sql
			
 
				+        )
			
 
				+        if result:
			
 
				+            continue
			
 
				+        tags = set(account_dict[key])
			
 
				+        name_set = {'鲁涛', '罗情', '余海涛', '范军'}
			
 
				+        platform_set = {'西瓜新爬虫', '快手账号爬虫', '公众新号', '⭐️小年糕爬虫', '抖音爬虫', '视频号'}
			
 
				+        name = tags & name_set
			
 
				+        platform = tags & platform_set
			
 
				+        if name and platform:
			
 
				+            user_name = list(name)[0]
			
 
				+            platform_name = list(platform)[0]
			
 
				+            sql = f"""INSERT INTO changsha_user_accounts (piaoquan_account_id, user_name, platform) VALUES ('{key}', '{user_name}', '{platform_name}');"""
			
 
				+            MysqlHelper.update_values(log_type="author", crawler="changsha", sql=sql, env="prod")
			
 
				+            AliyunLogger.logging(
			
 
				+                code="8888",
			
 
				+                platform=platform,
			
 
				+                mode="author",
			
 
				+                env="prod",
			
 
				+                message="更新账号-{}-{}".format(user_name, key)
			
 
				+            )
			
 
				+
			
 
				+
			
 
				+def protect_(function):
			
 
				+    """
			
 
				+    守护进程，在程序启动后的某一个时段内守护爬虫进程
			
 
				+    :param function: 被守护的函数
			
 
				+    """
			
 
				+    process = multiprocessing.Process(target=function)
			
 
				+    process.start()
			
 
				+    while True:
			
 
				+        if not process.is_alive():
			
 
				+            process.terminate()
			
 
				+            time.sleep(60)
			
 
				+            process = multiprocessing.Process(target=function)
			
 
				+            process.start()
			
 
				+        time.sleep(60)
			
 
				+
			
 
				+
			
 
				+def process_acc():
			
 
				+    """
			
 
				+    执行函数
			
 
				+    """
			
 
				+    dd_dict = read_accounts_from_mysql()
			
 
				+    insert_accounts(dd_dict)
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """
			
 
				+    定时执行任务， 每天晚上更新账号
			
 
				+    """
			
 
				+    schedule.every().day.at("23:45").do(process_acc)
			
 
				+    while True:
			
 
				+        schedule.run_pending()  # 运行待处理的任务
			
 
				+        time.sleep(1)  # 每隔一秒检查一次是否有待执行的任务
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    # protect_(main)
			
 
				+    # process_acc()
			
 
				+    main()
			
--- a/test.py
+++ b/test.py
--- a/xigua/xigua_author/xigua_author.py
+++ b/xigua/xigua_author/xigua_author.py
@@ -1,5 +1,6 @@
 
				 import json
			
 
				 import os
			
 
				+import re
			
 
				 import random
			
 
				 import sys
			
 
				 import string
			
@@ -17,6 +18,59 @@ from common import AliyunLogger, PiaoQuanPipeline, tunnel_proxies
 
				 from common.limit import AuthorLimit
			
 
				 
			
 
				 
			
 
				+def extract_info_by_re(text):
			
 
				+    """
			
 
				+    通过正则表达式获取文本中的信息
			
 
				+    :param text:
			
 
				+    :return:
			
 
				+    """
			
 
				+    # 标题
			
 
				+    title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
			
 
				+    if title_match:
			
 
				+        title_content = title_match.group(1)
			
 
				+        title_content = title_content.split(" - ")[0]
			
 
				+        title_content = bytes(title_content, "latin1").decode()
			
 
				+    else:
			
 
				+        title_content = ""
			
 
				+    # video_url
			
 
				+    main_url = re.search(r'("main_url":")(.*?)"', text)[0]
			
 
				+    main_url = main_url.split(":")[1]
			
 
				+    decoded_data = base64.b64decode(main_url)
			
 
				+    try:
			
 
				+        # 尝试使用utf-8解码
			
 
				+        video_url = decoded_data.decode()
			
 
				+    except UnicodeDecodeError:
			
 
				+        # 如果utf-8解码失败，尝试使用其他编码方式
			
 
				+        video_url = decoded_data.decode('latin-1')
			
 
				+
			
 
				+    # video_id
			
 
				+    video_id = re.search(r'"vid":"(.*?)"', text).group(1)
			
 
				+
			
 
				+    # like_count
			
 
				+    like_count = re.search(r'"video_like_count":(.*?),', text).group(1)
			
 
				+
			
 
				+    # cover_url
			
 
				+    cover_url = re.search(r'"avatar_url":"(.*?)"', text).group(1)
			
 
				+
			
 
				+    # video_play
			
 
				+    video_watch_count = re.search(r'"video_watch_count":(.*?),', text).group(1)
			
 
				+
			
 
				+    # "video_publish_time"
			
 
				+    publish_time = re.search(r'"video_publish_time":"(.*?)"', text).group(1)
			
 
				+
			
 
				+    # video_duration
			
 
				+    duration = re.search(r'("video_duration":)(.*?)"', text).group(2).replace(",", "")
			
 
				+    return {
			
 
				+        "title": title_content,
			
 
				+        "url": video_url,
			
 
				+        "video_id": video_id,
			
 
				+        "like_count": like_count,
			
 
				+        "cover_url": cover_url,
			
 
				+        "play_count": video_watch_count,
			
 
				+        "publish_time": publish_time,
			
 
				+        "duration": duration
			
 
				+    }
			
 
				+
			
 
				 def random_signature():
			
 
				     """
			
 
				     随机生成签名
			
@@ -48,6 +102,23 @@ def random_signature():
 
				         new_password = new_password_start + "y" + new_password_end
			
 
				     return new_password
			
 
				 
			
 
				+def byte_dance_cookie(item_id):
			
 
				+    """
			
 
				+    获取西瓜视频的 cookie
			
 
				+    :param item_id:
			
 
				+    """
			
 
				+    sess = requests.Session()
			
 
				+    sess.headers.update({
			
 
				+        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
			
 
				+        'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
			
 
				+    })
			
 
				+
			
 
				+    # 获取 cookies
			
 
				+    sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
			
 
				+    data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
			
 
				+    r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
			
 
				+    # print(r.text)
			
 
				+    return r.cookies.values()[0]
			
 
				 
			
 
				 def get_video_url(video_info):
			
 
				     """
			
@@ -640,6 +711,7 @@ class XiGuaAuthor:
 
				     """
			
 
				     西瓜账号爬虫
			
 
				     """
			
 
				+
			
 
				     def __init__(self, platform, mode, rule_dict, env, user_list):
			
 
				         self.platform = platform
			
 
				         self.mode = mode
			
@@ -656,30 +728,33 @@ class XiGuaAuthor:
 
				         :param account: 输入的账号信息
			
 
				         {'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}}
			
 
				         """
			
 
				-        flag = account['link'].split("_")[0]
			
 
				-        if flag == "V1":
			
 
				-            rule_dict = {
			
 
				-                "play_cnt": {"min": 100000, "max": 0},
			
 
				-                'period': {"min": 90, "max": 90},
			
 
				-                'special': 0.02
			
 
				-            }
			
 
				-            return rule_dict
			
 
				-        elif flag == "V2":
			
 
				-            rule_dict = {
			
 
				-                "play_cnt": {"min": 10000, "max": 0},
			
 
				-                'period': {"min": 90, "max": 90},
			
 
				-                'special': 0.01
			
 
				-            }
			
 
				-            return rule_dict
			
 
				-        elif flag == "V3":
			
 
				-            rule_dict = {
			
 
				-                "play_cnt": {"min": 5000, "max": 0},
			
 
				-                'period': {"min": 90, "max": 90},
			
 
				-                'special': 0.01
			
 
				-            }
			
 
				-            return rule_dict
			
 
				-        else:
			
 
				+        temp = account['link'].split("?")[0].split("_")
			
 
				+        if len(temp) == 1:
			
 
				             return self.rule_dict
			
 
				+        else:
			
 
				+            flag = temp[-2]
			
 
				+            match flag:
			
 
				+                case "V1":
			
 
				+                    rule_dict = {
			
 
				+                        "play_cnt": {"min": 100000, "max": 0},
			
 
				+                        'period': {"min": 90, "max": 90},
			
 
				+                        'special': 0.02
			
 
				+                    }
			
 
				+                    return rule_dict
			
 
				+                case "V2":
			
 
				+                    rule_dict = {
			
 
				+                        "play_cnt": {"min": 10000, "max": 0},
			
 
				+                        'period': {"min": 90, "max": 90},
			
 
				+                        'special': 0.01
			
 
				+                    }
			
 
				+                    return rule_dict
			
 
				+                case "V3":
			
 
				+                    rule_dict = {
			
 
				+                        "play_cnt": {"min": 5000, "max": 0},
			
 
				+                        'period': {"min": 90, "max": 90},
			
 
				+                        'special': 0.01
			
 
				+                    }
			
 
				+                    return rule_dict
			
 
				 
			
 
				     def get_author_list(self):
			
 
				         """
			
@@ -690,7 +765,19 @@ class XiGuaAuthor:
 
				         for user_dict in self.user_list:
			
 
				             # if self.download_count <= max_count:
			
 
				             try:
			
 
				-                self.get_video_list(user_dict)
			
 
				+                flag = user_dict["link"][0]
			
 
				+                match flag:
			
 
				+                    case "V":
			
 
				+                        self.get_video_list(user_dict)
			
 
				+                    case "X":
			
 
				+                        self.get_tiny_video_list(user_dict)
			
 
				+                    case "h":
			
 
				+                        self.get_video_list(user_dict)
			
 
				+                    case "D":
			
 
				+                        self.get_video_list(user_dict)
			
 
				+                    case "B":
			
 
				+                        self.get_video_list(user_dict)
			
 
				+                        self.get_tiny_video_list(user_dict)
			
 
				             except Exception as e:
			
 
				                 AliyunLogger.logging(
			
 
				                     code="3001",
			
@@ -714,15 +801,13 @@ class XiGuaAuthor:
 
				     def get_video_list(self, user_dict):
			
 
				         """
			
 
				         获取某个账号的视频列表
			
 
				+        账号分为 3 类
			
 
				         """
			
 
				         offset = 0
			
 
				         signature = random_signature()
			
 
				+        link = user_dict['link'].split("?")[0].split("_")[-1]
			
 
				         url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
			
 
				         while True:
			
 
				-            if user_dict['link'][0] == "V":
			
 
				-                link = user_dict["link"][3:]
			
 
				-            else:
			
 
				-                link = user_dict["link"]
			
 
				             to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
			
 
				             params = {
			
 
				                 "to_user_id": to_user_id,
			
@@ -731,8 +816,6 @@ class XiGuaAuthor:
 
				                 "maxBehotTime": "0",
			
 
				                 "order": "new",
			
 
				                 "isHome": "0",
			
 
				-                # 'msToken': 'G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==',
			
 
				-                # 'X-Bogus': 'DFSzswVuEkUANjW9ShFTgR/F6qHt',
			
 
				                 "_signature": signature,
			
 
				             }
			
 
				             headers = {
			
@@ -749,7 +832,7 @@ class XiGuaAuthor:
 
				             offset += 30
			
 
				             if "data" not in response.text or response.status_code != 200:
			
 
				                 AliyunLogger.logging(
			
 
				-                    code="2000",
			
 
				+                    code="3000",
			
 
				                     platform=self.platform,
			
 
				                     mode=self.mode,
			
 
				                     env=self.env,
			
@@ -758,10 +841,12 @@ class XiGuaAuthor:
 
				                 return
			
 
				             elif not response.json()["data"]["videoList"]:
			
 
				                 AliyunLogger.logging(
			
 
				-                    code="2000",
			
 
				+                    account=link,
			
 
				+                    code="3000",
			
 
				                     platform=self.platform,
			
 
				                     mode=self.mode,
			
 
				                     env=self.env,
			
 
				+                    data=response.json(),
			
 
				                     message=f"没有更多数据啦~\n",
			
 
				                 )
			
 
				                 return
			
@@ -778,7 +863,7 @@ class XiGuaAuthor:
 
				                             data=video_obj,
			
 
				                             message="扫描到一条视频",
			
 
				                         )
			
 
				-                        date_flag = self.process_video_obj(video_obj, user_dict)
			
 
				+                        date_flag = self.process_video_obj(video_obj, user_dict, "l")
			
 
				                         if not date_flag:
			
 
				                             return
			
 
				                     except Exception as e:
			
@@ -791,10 +876,88 @@ class XiGuaAuthor:
 
				                             message="抓取单条视频异常, 报错原因是: {}".format(e),
			
 
				                         )
			
 
				 
			
 
				-    def process_video_obj(self, video_obj, user_dict):
			
 
    def get_tiny_video_list(self, user_dict):
        """
        Page through an account's short videos and process each entry.

        The account id is taken from ``user_dict['link']`` (query string and
        any ``_``-separated prefix removed). Pages are requested until the
        response is unusable, a page is empty, or ``process_video_obj``
        returns a falsy value. Errors on a single video are logged and the
        loop continues with the next one.

        :param user_dict: account record; reads ``link`` and ``uid``
        """
        api = "https://www.ixigua.com/api/videov2/hotsoon/video"
        account_link = user_dict['link'].split("?")[0].split("_")[-1]
        to_user_id = str(account_link.replace("https://www.ixigua.com/home/", ""))
        # Paging cursor: taken from the last item of each page.
        cursor = "0"
        while True:
            query = {
                "to_user_id": to_user_id,
                "max_behot_time": cursor,
                "_signature": random_signature()
            }
            request_headers = {
                "referer": "https://www.ixigua.com/{}?&".format(to_user_id),
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
            }
            response = requests.get(
                url=api,
                headers=request_headers,
                params=query,
                proxies=tunnel_proxies(),
                timeout=5,
            )
            # Guard 1: unusable response.
            if "data" not in response.text or response.status_code != 200:
                AliyunLogger.logging(
                    code="2000",
                    platform=self.platform,
                    mode=self.mode,
                    env=self.env,
                    message="get_videoList:{}\n".format(response.text),
                )
                return

            payload = response.json()
            videos = payload["data"]["data"]
            # Guard 2: no more items for this account.
            if not videos:
                AliyunLogger.logging(
                    account=account_link,
                    code="2000",
                    platform=self.platform,
                    mode=self.mode,
                    env=self.env,
                    data=payload,
                    message="没有更多数据啦~\n",
                )
                return

            cursor = videos[-1]["max_behot_time"]
            for video_obj in videos:
                try:
                    AliyunLogger.logging(
                        code="1001",
                        account=user_dict['uid'],
                        platform=self.platform,
                        mode=self.mode,
                        env=self.env,
                        data=video_obj,
                        message="扫描到一条小视频",
                    )
                    # "s" marks the item as a short video for process_video_obj.
                    if not self.process_video_obj(video_obj, user_dict, "s"):
                        return
                except Exception as e:
                    AliyunLogger.logging(
                        code="3000",
                        platform=self.platform,
                        mode=self.mode,
                        env=self.env,
                        data=video_obj,
                        message="抓取单条视频异常, 报错原因是: {}".format(e),
                    )
			
 
				+
			
 
				+    def process_video_obj(self, video_obj, user_dict, f):
			
 
				+        """
			
 
				+        process video_obj and extract video_url
			
 
				+        """
			
 
				         new_rule = self.rule_maker(user_dict)
			
 
				         trace_id = self.platform + str(uuid.uuid1())
			
 
				-        item_id = video_obj.get("item_id", "")
			
 
				+        if f == "s":
			
 
				+            item_id = video_obj.get("id_str", "")
			
 
				+        else:
			
 
				+            item_id = video_obj.get("item_id", "")
			
 
				         if not item_id:
			
 
				             AliyunLogger.logging(
			
 
				                 code="2005",
			
@@ -808,8 +971,7 @@ class XiGuaAuthor:
 
				             )
			
 
				             return
			
 
				         # 获取视频信息
			
 
				-        video_dict = self.get_video_info(item_id=item_id, trace_id=trace_id)
			
 
				-        video_dict["out_user_id"] = video_dict["user_id"]
			
 
				+        video_dict = self.get_video_info(item_id=item_id)
			
 
				         video_dict["platform"] = self.platform
			
 
				         video_dict["strategy"] = self.mode
			
 
				         video_dict["out_video_id"] = video_dict["video_id"]
			
@@ -904,101 +1066,51 @@ class XiGuaAuthor:
 
				                         )
			
 
				             return True
			
 
				 
			
 
				-    def get_video_info(self, item_id, trace_id):
			
 
				-        url = "https://www.ixigua.com/api/mixVideo/information?"
			
 
				+    def get_video_info(self, item_id):
			
 
				+        """
			
 
				+        获取视频信息
			
 
				+        """
			
 
				+        url = "https://www.ixigua.com/{}".format(item_id)
			
 
				         headers = {
			
 
				             "accept-encoding": "gzip, deflate",
			
 
				             "accept-language": "zh-CN,zh-Hans;q=0.9",
			
 
				+            "cookie": "ttwid={}".format(byte_dance_cookie(item_id)),
			
 
				             "user-agent": FakeUserAgent().random,
			
 
				-            "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
			
 
				-        }
			
 
				-        params = {
			
 
				-            "mixId": str(item_id),
			
 
				-            "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC"
			
 
				-                       "NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
			
 
				-            "X-Bogus": "DFSzswVupYTANCJOSBk0P53WxM-r",
			
 
				-            "_signature": "_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px"
			
 
				-                          "fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94",
			
 
				-        }
			
 
				-        cookies = {
			
 
				-            "ixigua-a-s": "1",
			
 
				-            "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB"
			
 
				-                       "NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
			
 
				-            "ttwid": "1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7"
			
 
				-                     "6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8",
			
 
				-            "tt_scid": "QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3",
			
 
				-            "MONITOR_WEB_ID": "0a49204a-7af5-4e96-95f0-f4bafb7450ad",
			
 
				-            "__ac_nonce": "06304878000964fdad287",
			
 
				-            "__ac_signature": "_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb"
			
 
				-                              "FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8",
			
 
				-            "ttcid": "e56fabf6e85d4adf9e4d91902496a0e882",
			
 
				-            "_tea_utm_cache_1300": "undefined",
			
 
				-            "support_avif": "false",
			
 
				-            "support_webp": "false",
			
 
				-            "xiguavideopcwebid": "7134967546256016900",
			
 
				-            "xiguavideopcwebid.sig": "xxRww5R1VEMJN_dQepHorEu_eAc",
			
 
				+            "referer": "https://www.ixigua.com/{}/".format(item_id),
			
 
				         }
			
 
				         response = requests.get(
			
 
				             url=url,
			
 
				             headers=headers,
			
 
				-            params=params,
			
 
				-            cookies=cookies,
			
 
				             proxies=tunnel_proxies(),
			
 
				             timeout=5,
			
 
				         )
			
 
				-        if (
			
 
				-                response.status_code != 200
			
 
				-                or "data" not in response.json()
			
 
				-                or response.json()["data"] == {}
			
 
				-        ):
			
 
				-            AliyunLogger.logging(
			
 
				-                code="2000",
			
 
				-                platform=self.platform,
			
 
				-                mode=self.mode,
			
 
				-                env=self.env,
			
 
				-                message="获取视频信息失败",
			
 
				-                trace_id=trace_id,
			
 
				-            )
			
 
				-            return None
			
 
				-        else:
			
 
				-            video_info = (
			
 
				-                response.json()["data"]
			
 
				-                .get("gidInformation", {})
			
 
				-                .get("packerData", {})
			
 
				-                .get("video", {})
			
 
				-            )
			
 
				-            if video_info == {}:
			
 
				-                return None
			
 
				-            video_detail = get_video_url(video_info)
			
 
				 
			
 
				-            video_dict = {
			
 
				+        video_info = extract_info_by_re(response.text)
			
 
				+        video_dict = {
			
 
				                 "video_title": video_info.get("title", ""),
			
 
				-                "video_id": video_info.get("videoResource", {}).get("vid", ""),
			
 
				+                "video_id": video_info.get("video_id"),
			
 
				                 "gid": str(item_id),
			
 
				-                "play_cnt": int(video_info.get("video_watch_count", 0)),
			
 
				-                "like_cnt": int(video_info.get("video_like_count", 0)),
			
 
				-                "comment_cnt": int(get_comment_cnt(item_id)),
			
 
				+                "play_cnt": int(video_info.get("play_count", 0)),
			
 
				+                "like_cnt": int(video_info.get("like_count", 0)),
			
 
				+                "comment_cnt": 0,
			
 
				                 "share_cnt": 0,
			
 
				                 "favorite_cnt": 0,
			
 
				-                "duration": int(video_info.get("video_duration", 0)),
			
 
				-                "video_width": int(video_detail["video_width"]),
			
 
				-                "video_height": int(video_detail["video_height"]),
			
 
				-                "publish_time_stamp": int(video_info.get("video_publish_time", 0)),
			
 
				+                "duration": int(video_info.get("duration", 0)),
			
 
				+                "video_width": 0,
			
 
				+                "video_height": 0,
			
 
				+                "publish_time_stamp": int(video_info.get("publish_time", 0)),
			
 
				                 "publish_time_str": time.strftime(
			
 
				                     "%Y-%m-%d %H:%M:%S",
			
 
				-                    time.localtime(int(video_info.get("video_publish_time", 0))),
			
 
				+                    time.localtime(int(video_info.get("publish_time", 0))),
			
 
				                 ),
			
 
				-                "user_name": video_info.get("user_info", {}).get("name", ""),
			
 
				-                "user_id": str(video_info.get("user_info", {}).get("user_id", "")),
			
 
				                 "avatar_url": str(
			
 
				                     video_info.get("user_info", {}).get("avatar_url", "")
			
 
				                 ),
			
 
				-                "cover_url": video_info.get("poster_url", ""),
			
 
				-                "audio_url": video_detail["audio_url"],
			
 
				-                "video_url": video_detail["video_url"],
			
 
				+                "cover_url": video_info.get("cover_url", ""),
			
 
				+                "video_url": video_info.get("url"),
			
 
				                 "session": f"xigua-search-{int(time.time())}",
			
 
				             }
			
 
				-            return video_dict
			
 
				+        return video_dict
			
 
				 
			
 
				 
			
 
				 if __name__ == "__main__":
			
@@ -1028,12 +1140,12 @@ if __name__ == "__main__":
 
				             "mode": "author",
			
 
				         },
			
 
				     ]
			
 
				-    # rule = {'period': {'min': 30, 'max': 30}, 'duration': {'min': 20, 'max': 0}, 'play_cnt': {'min': 100000, 'max': 0}}
			
 
				-    # XGA = XiGuaAuthor(
			
 
				-    #     platform="xigua",
			
 
				-    #     mode="author",
			
 
				-    #     rule_dict=rule,
			
 
				-    #     env="prod",
			
 
				-    #     user_list=user_list
			
 
				-    # )
			
 
				-    # XGA.get_author_list()
			
 
				+    rule = {'period': {'min': 30, 'max': 30}, 'duration': {'min': 20, 'max': 0}, 'play_cnt': {'min': 100000, 'max': 0}}
			
 
				+    XGA = XiGuaAuthor(
			
 
				+        platform="xigua",
			
 
				+        mode="author",
			
 
				+        rule_dict=rule,
			
 
				+        env="prod",
			
 
				+        user_list=user_list
			
 
				+    )
			
 
				+    XGA.get_author_list()
			
--- a/xigua/xigua_author/xigua_dev.py
+++ b/xigua/xigua_author/xigua_dev.py
@@ -0,0 +1,498 @@
 
				+import json
			
 
				+import os
			
 
				+import re
			
 
				+import random
			
 
				+import sys
			
 
				+import string
			
 
				+import time
			
 
				+import uuid
			
 
				+import base64
			
 
				+import requests
			
 
				+from fake_useragent import FakeUserAgent
			
 
				+
			
 
				+from common.mq import MQ
			
 
				+
			
 
				+sys.path.append(os.getcwd())
			
 
				+
			
 
				+from common import PiaoQuanPipeline, tunnel_proxies
			
 
				+from common.limit import AuthorLimit
			
 
				+
			
 
				+
			
 
				+def extract_info_by_re(text):
			
 
				+    """
			
 
				+    通过正则表达式获取文本中的信息
			
 
				+    :param text:
			
 
				+    :return:
			
 
				+    """
			
 
				+    # 标题
			
 
				+    title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
			
 
				+    if title_match:
			
 
				+        title_content = title_match.group(1)
			
 
				+        title_content = title_content.split(" - ")[0]
			
 
				+        title_content = bytes(title_content, "latin1").decode()
			
 
				+    else:
			
 
				+        title_content = ""
			
 
				+    # video_url
			
 
				+    main_url = re.search(r'("main_url":")(.*?)"', text)[0]
			
 
				+    main_url = main_url.split(":")[1]
			
 
				+    decoded_data = base64.b64decode(main_url)
			
 
				+    try:
			
 
				+        # 尝试使用utf-8解码
			
 
				+        video_url = decoded_data.decode()
			
 
				+    except UnicodeDecodeError:
			
 
				+        # 如果utf-8解码失败，尝试使用其他编码方式
			
 
				+        video_url = decoded_data.decode('latin-1')
			
 
				+
			
 
				+    # video_id
			
 
				+    video_id = re.search(r'"vid":"(.*?)"', text).group(1)
			
 
				+
			
 
				+    # like_count
			
 
				+    like_count = re.search(r'"video_like_count":(.*?),', text).group(1)
			
 
				+
			
 
				+    # cover_url
			
 
				+    cover_url = re.search(r'"avatar_url":"(.*?)"', text).group(1)
			
 
				+
			
 
				+    # video_play
			
 
				+    video_watch_count = re.search(r'"video_watch_count":(.*?),', text).group(1)
			
 
				+
			
 
				+    # "video_publish_time"
			
 
				+    publish_time = re.search(r'"video_publish_time":"(.*?)"', text).group(1)
			
 
				+
			
 
				+    # video_duration
			
 
				+    duration = re.search(r'("video_duration":)(.*?)"', text).group(2).replace(",", "")
			
 
				+    return {
			
 
				+        "title": title_content,
			
 
				+        "url": video_url,
			
 
				+        "video_id": video_id,
			
 
				+        "like_count": like_count,
			
 
				+        "cover_url": cover_url,
			
 
				+        "play_count": video_watch_count,
			
 
				+        "publish_time": publish_time,
			
 
				+        "duration": duration
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+def random_signature():
			
 
				+    """
			
 
				+    随机生成签名
			
 
				+    """
			
 
				+    src_digits = string.digits  # string_数字
			
 
				+    src_uppercase = string.ascii_uppercase  # string_大写字母
			
 
				+    src_lowercase = string.ascii_lowercase  # string_小写字母
			
 
				+    digits_num = random.randint(1, 6)
			
 
				+    uppercase_num = random.randint(1, 26 - digits_num - 1)
			
 
				+    lowercase_num = 26 - (digits_num + uppercase_num)
			
 
				+    password = (
			
 
				+            random.sample(src_digits, digits_num)
			
 
				+            + random.sample(src_uppercase, uppercase_num)
			
 
				+            + random.sample(src_lowercase, lowercase_num)
			
 
				+    )
			
 
				+    random.shuffle(password)
			
 
				+    new_password = "AAAAAAAAAA" + "".join(password)[10:-4] + "AAAB"
			
 
				+    new_password_start = new_password[0:18]
			
 
				+    new_password_end = new_password[-7:]
			
 
				+    if new_password[18] == "8":
			
 
				+        new_password = new_password_start + "w" + new_password_end
			
 
				+    elif new_password[18] == "9":
			
 
				+        new_password = new_password_start + "x" + new_password_end
			
 
				+    elif new_password[18] == "-":
			
 
				+        new_password = new_password_start + "y" + new_password_end
			
 
				+    elif new_password[18] == ".":
			
 
				+        new_password = new_password_start + "z" + new_password_end
			
 
				+    else:
			
 
				+        new_password = new_password_start + "y" + new_password_end
			
 
				+    return new_password
			
 
				+
			
 
				+
			
 
				+def byte_dance_cookie(item_id):
			
 
				+    """
			
 
				+    获取西瓜视频的 cookie
			
 
				+    :param item_id:
			
 
				+    """
			
 
				+    sess = requests.Session()
			
 
				+    sess.headers.update({
			
 
				+        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
			
 
				+        'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
			
 
				+    })
			
 
				+
			
 
				+    # 获取 cookies
			
 
				+    sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
			
 
				+    data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
			
 
				+    r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
			
 
				+    # print(r.text)
			
 
				+    return r.cookies.values()[0]
			
 
				+
			
 
				+
			
 
				+class XiGuaAuthor(object):
			
 
				+    """
			
 
				+    西瓜账号爬虫
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self, platform, mode, rule_dict, env, user_list):
			
 
				+        self.platform = platform
			
 
				+        self.mode = mode
			
 
				+        self.rule_dict = rule_dict
			
 
				+        self.env = env
			
 
				+        self.user_list = user_list
			
 
				+        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
			
 
				+        self.download_count = 0
			
 
				+        self.limiter = AuthorLimit(platform=self.platform, mode=self.mode)
			
 
				+
			
 
				+    def rule_maker(self, account):
			
 
				+        """
			
 
				+        通过不同的账号生成不同的规则
			
 
				+        :param account: 输入的账号信息
			
 
				+        {'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}}
			
 
				+        """
			
 
				+        temp = account['link'].split("_")
			
 
				+        if len(temp) == 1:
			
 
				+            return self.rule_dict
			
 
				+        else:
			
 
				+            flag = temp[-2]
			
 
				+            match flag:
			
 
				+                case "V1":
			
 
				+                    rule_dict = {
			
 
				+                        "play_cnt": {"min": 100000, "max": 0},
			
 
				+                        'period': {"min": 90, "max": 90},
			
 
				+                        'special': 0.02
			
 
				+                    }
			
 
				+                    return rule_dict
			
 
				+                case "V2":
			
 
				+                    rule_dict = {
			
 
				+                        "play_cnt": {"min": 10000, "max": 0},
			
 
				+                        'period': {"min": 90, "max": 90},
			
 
				+                        'special': 0.01
			
 
				+                    }
			
 
				+                    return rule_dict
			
 
				+                case "V3":
			
 
				+                    rule_dict = {
			
 
				+                        "play_cnt": {"min": 5000, "max": 0},
			
 
				+                        'period': {"min": 90, "max": 90},
			
 
				+                        'special': 0.01
			
 
				+                    }
			
 
				+                    return rule_dict
			
 
				+
			
 
				+    def get_author_list(self):
			
 
				+        """
			
 
				+        每轮只抓取定量的数据，到达数量后自己退出
			
 
				+        获取账号列表以及账号信息
			
 
				+        """
			
 
				+        # max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 300))
			
 
				+        for user_dict in self.user_list:
			
 
				+            # if self.download_count <= max_count:
			
 
				+
			
 
				+            flag = user_dict["link"][0]
			
 
				+            print(user_dict)
			
 
				+            print(flag)
			
 
				+            match flag:
			
 
				+                case "V":
			
 
				+                    self.get_video_list(user_dict)
			
 
				+                case "X":
			
 
				+                    self.get_tiny_video_list(user_dict)
			
 
				+                case "h":
			
 
				+                    self.get_video_list(user_dict)
			
 
				+                case "D":
			
 
				+                    self.get_video_list(user_dict)
			
 
				+                case "B":
			
 
				+                    self.get_video_list(user_dict)
			
 
				+                    self.get_tiny_video_list(user_dict)
			
 
				+
			
 
				+            #     time.sleep(random.randint(1, 15))
			
 
				+            # else:
			
 
				+            #     AliyunLogger.logging(
			
 
				+            #         code="2000",
			
 
				+            #         platform=self.platform,
			
 
				+            #         mode=self.mode,
			
 
				+            #         env=self.env,
			
 
				+            #         message="本轮已经抓取足够数量的视频，已经自动退出",
			
 
				+            #     )
			
 
				+            #     return
			
 
				+
			
 
				+    def get_video_list(self, user_dict):
			
 
				+        """
			
 
				+        获取某个账号的视频列表
			
 
				+        账号分为 3 类
			
 
				+        """
			
 
				+        offset = 0
			
 
				+        signature = random_signature()
			
 
				+        link = user_dict['link'].split("_")[-1]
			
 
				+        url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
			
 
				+        while True:
			
 
				+            to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
			
 
				+            params = {
			
 
				+                "to_user_id": to_user_id,
			
 
				+                "offset": str(offset),
			
 
				+                "limit": "30",
			
 
				+                "maxBehotTime": "0",
			
 
				+                "order": "new",
			
 
				+                "isHome": "0",
			
 
				+                "_signature": signature,
			
 
				+            }
			
 
				+            headers = {
			
 
				+                "referer": f'https://www.ixigua.com/home/{link.replace("https://www.ixigua.com/home/", "")}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
			
 
				+                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
			
 
				+            }
			
 
				+            response = requests.get(
			
 
				+                url=url,
			
 
				+                headers=headers,
			
 
				+                params=params,
			
 
				+                proxies=tunnel_proxies(),
			
 
				+                timeout=5,
			
 
				+            )
			
 
				+            offset += 30
			
 
				+            if "data" not in response.text or response.status_code != 200:
			
 
				+                message = f"get_videoList:{response.text}\n"
			
 
				+                print(message)
			
 
				+                return
			
 
				+            elif not response.json()["data"]["videoList"]:
			
 
				+                message = f"没有更多数据啦~\n"
			
 
				+                print(params)
			
 
				+                return
			
 
				+            else:
			
 
				+                feeds = response.json()["data"]["videoList"]
			
 
				+                for video_obj in feeds:
			
 
				+                    message = "扫描到一条视频"
			
 
				+                    print(message)
			
 
				+                    date_flag = self.process_video_obj(video_obj, user_dict, "l")
			
 
				+                    if not date_flag:
			
 
				+                        return
			
 
				+
			
 
				+    def get_tiny_video_list(self, user_dict):
			
 
				+        """
			
 
				+        获取小视频
			
 
				+        """
			
 
				+        url = "https://www.ixigua.com/api/videov2/hotsoon/video"
			
 
				+        max_behot_time = "0"
			
 
				+        link = user_dict['link'].split("_")[-1]
			
 
				+        to_user_id = str(link.replace("https://www.ixigua.com/home/", ""))
			
 
				+        while True:
			
 
				+            params = {
			
 
				+                "to_user_id": to_user_id,
			
 
				+                "max_behot_time": max_behot_time,
			
 
				+                "_signature": random_signature()
			
 
				+            }
			
 
				+            headers = {
			
 
				+                "referer": "https://www.ixigua.com/{}?&".format(to_user_id),
			
 
				+                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
			
 
				+            }
			
 
				+            response = requests.get(
			
 
				+                url=url,
			
 
				+                headers=headers,
			
 
				+                params=params,
			
 
				+                proxies=tunnel_proxies(),
			
 
				+                timeout=5,
			
 
				+            )
			
 
				+            if "data" not in response.text or response.status_code != 200:
			
 
				+                AliyunLogger.logging(
			
 
				+                    code="2000",
			
 
				+                    platform=self.platform,
			
 
				+                    mode=self.mode,
			
 
				+                    env=self.env,
			
 
				+                    message=f"get_videoList:{response.text}\n",
			
 
				+                )
			
 
				+                return
			
 
				+            elif not response.json()["data"]["data"]:
			
 
				+                AliyunLogger.logging(
			
 
				+                    code="2000",
			
 
				+                    platform=self.platform,
			
 
				+                    mode=self.mode,
			
 
				+                    env=self.env,
			
 
				+                    message=f"没有更多数据啦~\n",
			
 
				+                )
			
 
				+                return
			
 
				+            else:
			
 
				+                video_list = response.json()['data']['data']
			
 
				+                max_behot_time = video_list[-1]["max_behot_time"]
			
 
				+                for video_obj in video_list:
			
 
				+                    try:
			
 
				+                        AliyunLogger.logging(
			
 
				+                            code="1001",
			
 
				+                            account=user_dict['uid'],
			
 
				+                            platform=self.platform,
			
 
				+                            mode=self.mode,
			
 
				+                            env=self.env,
			
 
				+                            data=video_obj,
			
 
				+                            message="扫描到一条小视频",
			
 
				+                        )
			
 
				+                        date_flag = self.process_video_obj(video_obj, user_dict, "s")
			
 
				+                        if not date_flag:
			
 
				+                            return
			
 
				+                    except Exception as e:
			
 
				+                        AliyunLogger.logging(
			
 
				+                            code="3000",
			
 
				+                            platform=self.platform,
			
 
				+                            mode=self.mode,
			
 
				+                            env=self.env,
			
 
				+                            data=video_obj,
			
 
				+                            message="抓取单条视频异常, 报错原因是: {}".format(e),
			
 
				+                        )
			
 
				+
			
 
				+    def process_video_obj(self, video_obj, user_dict, f):
			
 
				+        """
			
 
				+        process video_obj and extract video_url
			
 
				+        """
			
 
				+        new_rule = self.rule_maker(user_dict)
			
 
				+        trace_id = self.platform + str(uuid.uuid1())
			
 
				+        if f == "s":
			
 
				+            item_id = video_obj.get("id_str", "")
			
 
				+        else:
			
 
				+            item_id = video_obj.get("item_id", "")
			
 
				+        if not item_id:
			
 
				+            message="无效视频"
			
 
				+            print(message)
			
 
				+            return
			
 
				+        # 获取视频信息
			
 
				+        video_dict = self.get_video_info(item_id=item_id, trace_id=trace_id)
			
 
				+        # video_dict["out_user_id"] = video_dict["user_id"]
			
 
				+        video_dict["platform"] = self.platform
			
 
				+        video_dict["strategy"] = self.mode
			
 
				+        video_dict["out_video_id"] = video_dict["video_id"]
			
 
				+        video_dict["width"] = video_dict["video_width"]
			
 
				+        video_dict["height"] = video_dict["video_height"]
			
 
				+        video_dict["crawler_rule"] = json.dumps(new_rule)
			
 
				+        video_dict["user_id"] = user_dict["uid"]
			
 
				+        video_dict["publish_time"] = video_dict["publish_time_str"]
			
 
				+        video_dict["strategy_type"] = self.mode
			
 
				+        video_dict["update_time_stamp"] = int(time.time())
			
 
				+        if int(time.time()) - video_dict['publish_time_stamp'] > 3600 * 24 * int(
			
 
				+                new_rule.get("period", {}).get("max", 1000)):
			
 
				+            if not video_obj['is_top']:
			
 
				+                """
			
 
				+                非置顶数据发布时间超过才退出
			
 
				+                """
			
 
				+
			
 
				+                message = "发布时间超过{}天".format(
			
 
				+                    int(new_rule.get("period", {}).get("max", 1000))
			
 
				+                )
			
 
				+                print(message)
			
 
				+
			
 
				+                return False
			
 
				+        pipeline = PiaoQuanPipeline(
			
 
				+            platform=self.platform,
			
 
				+            mode=self.mode,
			
 
				+            rule_dict=new_rule,
			
 
				+            env=self.env,
			
 
				+            item=video_dict,
			
 
				+            trace_id=trace_id,
			
 
				+        )
			
 
				+        limit_flag = self.limiter.author_limitation(user_id=video_dict['user_id'])
			
 
				+        print(json.dumps(video_dict, ensure_ascii=False, indent=4))
			
 
				+        # if limit_flag:
			
 
				+        #     title_flag = pipeline.title_flag()
			
 
				+        #     repeat_flag = pipeline.repeat_video()
			
 
				+        #     if title_flag and repeat_flag:
			
 
				+        #         if new_rule.get("special"):
			
 
				+        #             if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
			
 
				+        #                 if float(video_dict['like_cnt']) / float(video_dict['play_cnt']) >= new_rule['special']:
			
 
				+        #                     print(json.dumps(video_dict, ensure_ascii=False, indent=4))
			
 
				+        #                     # self.mq.send_msg(video_dict)
			
 
				+        #                     self.download_count += 1
			
 
				+        #
			
 
				+        #                     return True
			
 
				+        #                 else:
			
 
				+        #                     message="不满足特殊规则, 点赞量/播放量"
			
 
				+        #                     print(json.dumps(video_dict, ensure_ascii=False, indent=4))
			
 
				+        #                     print(message)
			
 
				+        #                     return False
			
 
				+        #
			
 
				+        #         else:
			
 
				+        #             if int(video_dict['play_cnt']) >= int(new_rule.get("play_cnt", {}).get("min", 100000)):
			
 
				+        #                 self.mq.send_msg(video_dict)
			
 
				+        #                 self.download_count += 1
			
 
				+        #                     message="成功发送 MQ 至 ETL",
			
 
				+        #                 )
			
 
				+        #                 return True
			
 
				+        #             else:
			
 
				+        #                 AliyunLogger.logging(
			
 
				+        #                     code="2008",
			
 
				+        #                     account=user_dict['uid'],
			
 
				+        #                     platform=self.platform,
			
 
				+        #                     mode=self.mode,
			
 
				+        #                     env=self.env,
			
 
				+        #                     message="不满足特殊规则, 播放量",
			
 
				+        #                     data=video_dict
			
 
				+        #                 )
			
 
				+        #     return True
			
 
				+
			
 
				+    def get_video_info(self, item_id, trace_id):
			
 
				+        """
			
 
				+        获取视频信息
			
 
				+        """
			
 
				+        url = "https://www.ixigua.com/{}".format(item_id)
			
 
				+        headers = {
			
 
				+            "accept-encoding": "gzip, deflate",
			
 
				+            "accept-language": "zh-CN,zh-Hans;q=0.9",
			
 
				+            "user-agent": FakeUserAgent().random,
			
 
				+            "cookie": "ttwid={}".format(byte_dance_cookie(item_id)),
			
 
				+            "referer": "https://www.ixigua.com/{}/".format(item_id),
			
 
				+        }
			
 
				+        response = requests.get(
			
 
				+            url=url,
			
 
				+            headers=headers,
			
 
				+            proxies=tunnel_proxies(),
			
 
				+            timeout=5,
			
 
				+        )
			
 
				+        video_info = extract_info_by_re(response.text)
			
 
				+        video_dict = {
			
 
				+            "video_title": video_info.get("title", ""),
			
 
				+            "video_id": video_info.get("video_id"),
			
 
				+            "gid": str(item_id),
			
 
				+            "play_cnt": int(video_info.get("play_count", 0)),
			
 
				+            "like_cnt": int(video_info.get("like_count", 0)),
			
 
				+            "comment_cnt": 0,
			
 
				+            "share_cnt": 0,
			
 
				+            "favorite_cnt": 0,
			
 
				+            "duration": int(video_info.get("duration", 0)),
			
 
				+            "video_width": 0,
			
 
				+            "video_height": 0,
			
 
				+            "publish_time_stamp": int(video_info.get("publish_time", 0)),
			
 
				+            "publish_time_str": time.strftime(
			
 
				+                "%Y-%m-%d %H:%M:%S",
			
 
				+                time.localtime(int(video_info.get("publish_time", 0))),
			
 
				+            ),
			
 
				+            "avatar_url": str(
			
 
				+                video_info.get("user_info", {}).get("avatar_url", "")
			
 
				+            ),
			
 
				+            "cover_url": video_info.get("cover_url", ""),
			
 
				+            "video_url": video_info.get("url"),
			
 
				+            "session": f"xigua-search-{int(time.time())}",
			
 
				+        }
			
 
				+        return video_dict
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    user_list = [
			
 
				+        {
			
 
				+            "uid": 6267140,
			
 
				+            "source": "xigua",
			
 
				+            "link": "https://www.ixigua.com/home/2779177225827568",
			
 
				+            "nick_name": "秋晴爱音乐",
			
 
				+            "avatar_url": "",
			
 
				+            "mode": "author",
			
 
				+        },
			
 
				+        {
			
 
				+            "uid": 6267140,
			
 
				+            "source": "xigua",
			
 
				+            "link": "https://www.ixigua.com/home/2885546124776780",
			
 
				+            "nick_name": "朗诵放歌的老山羊",
			
 
				+            "avatar_url": "",
			
 
				+            "mode": "author",
			
 
				+        },
			
 
				+        {
			
 
				+            "uid": 6267140,
			
 
				+            "source": "xigua",
			
 
				+            "link": "https://www.ixigua.com/home/5880938217",
			
 
				+            "nick_name": "天原声疗",
			
 
				+            "avatar_url": "",
			
 
				+            "mode": "author",
			
 
				+        },
			
 
				+    ]
			
 
				+    rule = {'period': {'min': 30, 'max': 30}, 'duration': {'min': 20, 'max': 0}, 'play_cnt': {'min': 100000, 'max': 0}}
			
 
				+    XGA = XiGuaAuthor(
			
 
				+        platform="xigua",
			
 
				+        mode="author",
			
 
				+        rule_dict=rule,
			
 
				+        env="prod",
			
 
				+        user_list=user_list
			
 
				+    )
			
 
				+    XGA.get_author_list()