1 年之前 · eebeaef171
--- a/applications/functions/common.py
+++ b/applications/functions/common.py
@@ -3,6 +3,7 @@
 
				 """
			
 
				 import os
			
 
				 import json
			
 
				+import time
			
 
				 import uuid
			
 
				 import requests
			
 
				 import pymysql
			
@@ -17,21 +18,6 @@ class Functions(object):
 
				     通用工具代码
			
 
				     """
			
 
				 
			
 
				-    # 敏感词逻辑
			
 
				-    @classmethod
			
 
				-    def sensitive_flag(cls, title):
			
 
				-        """
			
 
				-        判断标题是否命中过滤词
			
 
				-        :param title:
			
 
				-        :return:
			
 
				-        """
			
 
				-        sensitive_words = MySQLServer().select_sensitive_words()
			
 
				-        for word in sensitive_words:
			
 
				-            if word in title:
			
 
				-                # title = title.replace(word, "*")
			
 
				-                return False
			
 
				-        return True
			
 
				-
			
 
				     # 自动加入白名单逻辑
			
 
				     @classmethod
			
 
				     def auto_white(cls, root_share_id):
			
@@ -155,7 +141,8 @@ class MySQLServer(object):
 
				         :param trace_id:
			
 
				         :return:
			
 
				         """
			
 
				-        sql = "select video_id from crawler_video where out_user_id = '{}' and video_title = '{}';".format(trace_id, trace_id)
			
 
				+        sql = "select video_id from crawler_video where out_user_id = '{}' and video_title = '{}';".format(trace_id,
			
 
				+                                                                                                           trace_id)
			
 
				         connection = pymysql.connect(
			
 
				             host="rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com",  # 数据库IP地址，内网地址
			
 
				             port=3306,  # 端口号
			
@@ -168,20 +155,12 @@ class MySQLServer(object):
 
				         cursor.execute(sql)
			
 
				         out_video_list = cursor.fetchall()
			
 
				         if len(out_video_list) > 0:
			
 
				-            vid_list = [i[0] for i in out_video_list if i[0] != 0]
			
 
				-            vid_list = [vid_list[0]]
			
 
				-            # dir_path = os.path.join(os.getcwd(), 'applications', 'static', "out_videos")
			
 
				-            # os.makedirs(os.path.dirname(dir_path), exist_ok=True)
			
 
				-            # done_list = os.listdir(dir_path)
			
 
				-            # process_list = [
			
 
				-            #     (
			
 
				-            #         i[1],
			
 
				-            #         trace_id,
			
 
				-            #         os.path.join(dir_path, "{}.json".format(i[0]))
			
 
				-            #     ) for i in out_video_list if not "{}.json".format(i[0]) in done_list
			
 
				-            # ]
			
 
				-            # if process_list:
			
 
				-            #     ask_kimi_and_save_to_local(process_list[0])
			
 
				+            if out_video_list[0][0] == 0:
			
 
				+                video_id = cls.search_id_to_video(trace_id)
			
 
				+            else:
			
 
				+                video_id = out_video_list[0][0]
			
 
				+
			
 
				+            vid_list = [video_id]
			
 
				             logging(
			
 
				                 code="2003",
			
 
				                 trace_id=trace_id,
			
@@ -252,6 +231,32 @@ class MySQLServer(object):
 
				         result = [line[0] for line in data]
			
 
				         return result
			
 
				 
			
 
				+    @classmethod
			
 
				+    def search_id_to_video(cls, trace_id):
			
 
				+        """
			
 
				+        通过 search_id 返回 video_id
			
 
				+        :param trace_id:
			
 
				+        :return:
			
 
				+        """
			
 
				+        sql = "select video_id from crawler_video where out_user_id = '{}' and video_title = '{}';".format(trace_id,
			
 
				+                                                                                                           trace_id)
			
 
				+        connection = pymysql.connect(
			
 
				+            host="rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com",  # 数据库IP地址，内网地址
			
 
				+            port=3306,  # 端口号
			
 
				+            user="crawler",  # mysql用户名
			
 
				+            passwd="crawler123456@",  # mysql用户登录密码
			
 
				+            db="piaoquan-crawler",  # 数据库名
			
 
				+            charset="utf8mb4"  # 如果数据库里面的文本是utf8编码的，charset指定是utf8
			
 
				+        )
			
 
				+        cursor = connection.cursor()
			
 
				+        cursor.execute(sql)
			
 
				+        out_video_list = cursor.fetchall()
			
 
				+        if int(out_video_list[0][0]) == 0:
			
 
				+            time.sleep(1)
			
 
				+            return cls.search_id_to_video(trace_id)
			
 
				+        else:
			
 
				+            return out_video_list[0][0]
			
 
				+
			
 
				 
			
 
				 class KimiServer(object):
			
 
				     """
			
@@ -364,4 +369,4 @@ class KimiServer(object):
 
				             model="moonshot-v1-8k",
			
 
				         )
			
 
				         response = chat_completion.choices[0].message.content
			
 
				-        return response
			
 
				+        return response
			
--- a/applications/functions/item.py
+++ b/applications/functions/item.py
@@ -1,98 +0,0 @@
 
				-"""
			
 
				-@author: luojunhui
			
 
				-"""
			
 
				-import time
			
 
				-
			
 
				-from applications.functions.common import Functions
			
 
				-
			
 
				-
			
 
				-class VideoItem(object):
			
 
				-    """
			
 
				-    function: 当扫描进一条视频的时候，对该视频的基本信息进行处理，保证发送给 pipeline和 etl 的 video_dict 是正确的
			
 
				-    __init__: 初始化空json 对象，用来存储视频信息
			
 
				-    add_video_info: 把视频信息存储到 item 对象中
			
 
				-    check_item: 检查 item 对象中的各个元素以及处理
			
 
				-    """
			
 
				-
			
 
				-    def __init__(self):
			
 
				-        self.item = {}
			
 
				-
			
 
				-    def add_video_info(self, key, value):
			
 
				-        self.item[key] = value
			
 
				-
			
 
				-    def check_item(self):
			
 
				-        """
			
 
				-        判断item 里面的字段，是否符合要求
			
 
				-        字段分为 3 类：
			
 
				-        1. 必须存在数据的字段： ["video_id", "user_id", "user_name", "out_user_id", "out_video_id", "session", "video_url", "cover_url", "platform", "strategy"]
			
 
				-        2. 不存在默认为 0 的字段 ：["duration", "play_cnt", "like_cnt", "comment_cnt", "share_cnt", "width", "height"]
			
 
				-        3. 需要后出理的字段： video_title, publish_time
			
 
				-        """
			
 
				-        if self.item.get("video_title"):
			
 
				-            self.item["video_title"] = Functions().clean_title(self.item["video_title"])
			
 
				-        else:
			
 
				-            return False
			
 
				-        if self.item.get("publish_time_stamp"):
			
 
				-            publish_time_str = time.strftime(
			
 
				-                "%Y-%m-%d %H:%M:%S", time.localtime(self.item["publish_time_stamp"])
			
 
				-            )
			
 
				-            self.add_video_info("publish_time_str", publish_time_str)
			
 
				-        else:
			
 
				-            publish_time_stamp = int(time.time())
			
 
				-            publish_time_str = time.strftime(
			
 
				-                "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
			
 
				-            )
			
 
				-            self.add_video_info("publish_time_stamp", publish_time_stamp)
			
 
				-            self.add_video_info("publish_time_str", publish_time_str)
			
 
				-        self.add_video_info("publish_time", publish_time_str)
			
 
				-        if not self.item.get("update_time_stamp"):
			
 
				-            self.add_video_info("update_time_stamp", int(time.time()))
			
 
				-
			
 
				-        # 如果不存在，默认值为 0
			
 
				-        config_keys = [
			
 
				-            "duration",
			
 
				-            "play_cnt",
			
 
				-            "like_cnt",
			
 
				-            "comment_cnt",
			
 
				-            "share_cnt",
			
 
				-            "width",
			
 
				-            "height",
			
 
				-        ]
			
 
				-        for config_key in config_keys:
			
 
				-            if self.item.get(config_key):
			
 
				-                continue
			
 
				-            else:
			
 
				-                self.add_video_info(config_key, 0)
			
 
				-
			
 
				-        # 必须存在的元素，若不存在则会报错
			
 
				-        must_keys = [
			
 
				-            "video_id",
			
 
				-            "user_id",
			
 
				-            "user_name",
			
 
				-            "out_video_id",
			
 
				-            "session",
			
 
				-            "video_url",
			
 
				-            "cover_url",
			
 
				-            "platform",
			
 
				-            "strategy",
			
 
				-        ]
			
 
				-        """
			
 
				-        video_id, out_video_id 均为站外视频 id
			
 
				-        usr_id: 站内用户 id
			
 
				-        out_user_id: 站外用户 id
			
 
				-        user_name: 站外用户名称
			
 
				-        """
			
 
				-        for m_key in must_keys:
			
 
				-            if self.item.get(m_key):
			
 
				-                continue
			
 
				-            else:
			
 
				-                # print(m_key)
			
 
				-                return False
			
 
				-        return True
			
 
				-
			
 
				-    def produce_item(self):
			
 
				-        flag = self.check_item()
			
 
				-        if flag:
			
 
				-            return self.item
			
 
				-        else:
			
 
				-            return False
			
--- a/applications/functions/video_item.py
+++ b/applications/functions/video_item.py
@@ -0,0 +1,244 @@
 
				+"""
			
 
				+@author: luojunhui
			
 
				+"""
			
 
				+import time
			
 
				+from applications.functions.mq import MQ
			
 
				+from applications.functions.log import logging
			
 
				+from applications.functions.common import Functions
			
 
				+
			
 
				+
			
 
				+class VideoItem(object):
			
 
				+    """
			
 
				+    function: 当扫描进一条视频的时候，对该视频的基本信息进行处理，保证发送给 pipeline和 etl 的 video_dict 是正确的
			
 
				+    __init__: 初始化空json 对象，用来存储视频信息
			
 
				+    add_video_info: 把视频信息存储到 item 对象中
			
 
				+    check_item: 检查 item 对象中的各个元素以及处理
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        self.item = {}
			
 
				+
			
 
				+    def add_video_info(self, key, value):
			
 
				+        """
			
 
				+        insert or update video info
			
 
				+        :param key:
			
 
				+        :param value:
			
 
				+        """
			
 
				+        self.item[key] = value
			
 
				+
			
 
				+    def check_item(self):
			
 
				+        """
			
 
				+        判断item 里面的字段，是否符合要求
			
 
				+        字段分为 3 类：
			
 
				+        1. 必须存在数据的字段： ["video_id", "user_id", "user_name", "out_user_id", "out_video_id", "session", "video_url", "cover_url", "platform", "strategy"]
			
 
				+        2. 不存在默认为 0 的字段 ：["duration", "play_cnt", "like_cnt", "comment_cnt", "share_cnt", "width", "height"]
			
 
				+        3. 需要后出理的字段： video_title, publish_time
			
 
				+        """
			
 
				+        if self.item.get("video_title"):
			
 
				+            self.item["video_title"] = Functions().clean_title(self.item["video_title"])
			
 
				+        else:
			
 
				+            return False
			
 
				+        if self.item.get("publish_time_stamp"):
			
 
				+            publish_time_str = time.strftime(
			
 
				+                "%Y-%m-%d %H:%M:%S", time.localtime(self.item["publish_time_stamp"])
			
 
				+            )
			
 
				+            self.add_video_info("publish_time_str", publish_time_str)
			
 
				+        else:
			
 
				+            publish_time_stamp = int(time.time())
			
 
				+            publish_time_str = time.strftime(
			
 
				+                "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
			
 
				+            )
			
 
				+            self.add_video_info("publish_time_stamp", publish_time_stamp)
			
 
				+            self.add_video_info("publish_time_str", publish_time_str)
			
 
				+        self.add_video_info("publish_time", publish_time_str)
			
 
				+        if not self.item.get("update_time_stamp"):
			
 
				+            self.add_video_info("update_time_stamp", int(time.time()))
			
 
				+
			
 
				+        # 如果不存在，默认值为 0
			
 
				+        config_keys = [
			
 
				+            "duration",
			
 
				+            "play_cnt",
			
 
				+            "like_cnt",
			
 
				+            "comment_cnt",
			
 
				+            "share_cnt",
			
 
				+            "width",
			
 
				+            "height",
			
 
				+        ]
			
 
				+        for config_key in config_keys:
			
 
				+            if self.item.get(config_key):
			
 
				+                continue
			
 
				+            else:
			
 
				+                self.add_video_info(config_key, 0)
			
 
				+
			
 
				+        # 必须存在的元素，若不存在则会报错
			
 
				+        must_keys = [
			
 
				+            "video_id",
			
 
				+            "user_id",
			
 
				+            "user_name",
			
 
				+            "out_video_id",
			
 
				+            "session",
			
 
				+            "video_url",
			
 
				+            "cover_url",
			
 
				+            "platform",
			
 
				+            "strategy",
			
 
				+        ]
			
 
				+        """
			
 
				+        video_id, out_video_id 均为站外视频 id
			
 
				+        usr_id: 站内用户 id
			
 
				+        out_user_id: 站外用户 id
			
 
				+        user_name: 站外用户名称
			
 
				+        """
			
 
				+        for m_key in must_keys:
			
 
				+            if self.item.get(m_key):
			
 
				+                continue
			
 
				+            else:
			
 
				+                # print(m_key)
			
 
				+                return False
			
 
				+        return True
			
 
				+
			
 
				+    def produce_item(self):
			
 
				+        """
			
 
				+        item producer
			
 
				+        :return:
			
 
				+        """
			
 
				+        flag = self.check_item()
			
 
				+        if flag:
			
 
				+            return self.item
			
 
				+        else:
			
 
				+            return False
			
 
				+
			
 
				+
			
 
				+class VideoProducer(object):
			
 
				+    """
			
 
				+    处理视频
			
 
				+    todo: baidu && xigua video process
			
 
				+    """
			
 
				+
			
 
				+    @classmethod
			
 
				+    def wx_video_producer(cls, video_obj, user, trace_id):
			
 
				+        """
			
 
				+            异步处理微信 video_obj
			
 
				+            公众号和站内账号一一对应
			
 
				+            :param trace_id:
			
 
				+            :param user:
			
 
				+            :param video_obj:
			
 
				+            :return:
			
 
				+        """
			
 
				+        platform = "weixin_search"
			
 
				+        publish_time_stamp = int(video_obj['pubTime'])
			
 
				+        item = VideoItem()
			
 
				+        item.add_video_info("user_id", user["uid"])
			
 
				+        item.add_video_info("user_name", user["nick_name"])
			
 
				+        item.add_video_info("video_id", video_obj['hashDocID'])
			
 
				+        item.add_video_info("video_title", trace_id)
			
 
				+        item.add_video_info("publish_time_stamp", int(publish_time_stamp))
			
 
				+        item.add_video_info("video_url", video_obj["videoUrl"])
			
 
				+        item.add_video_info("cover_url", video_obj["image"])
			
 
				+        item.add_video_info("out_video_id", video_obj['hashDocID'])
			
 
				+        item.add_video_info("out_user_id", trace_id)
			
 
				+        item.add_video_info("platform", platform)
			
 
				+        item.add_video_info("strategy", "search")
			
 
				+        item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
			
 
				+        mq_obj = item.produce_item()
			
 
				+        return mq_obj
			
 
				+
			
 
				+    @classmethod
			
 
				+    def baidu_video_producer(cls, video_obj, user, trace_id):
			
 
				+        """
			
 
				+        处理好看视频的 video_info
			
 
				+        :param video_obj:
			
 
				+        :param user:
			
 
				+        :param trace_id:
			
 
				+        :return:
			
 
				+        """
			
 
				+        platform = "baidu_search"
			
 
				+        publish_time_stamp = int(video_obj['publish_time'])
			
 
				+        item = VideoItem()
			
 
				+        item.add_video_info("user_id", user["uid"])
			
 
				+        item.add_video_info("user_name", user["nick_name"])
			
 
				+        item.add_video_info("video_id", video_obj['id'])
			
 
				+        item.add_video_info("video_title", trace_id)
			
 
				+        item.add_video_info("publish_time_stamp", publish_time_stamp)
			
 
				+        item.add_video_info("video_url", video_obj["playurl"])
			
 
				+        item.add_video_info("cover_url", video_obj["poster"])
			
 
				+        item.add_video_info("out_video_id", video_obj['id'])
			
 
				+        item.add_video_info("out_user_id", trace_id)
			
 
				+        item.add_video_info("like_cnt", video_obj['like'])
			
 
				+        item.add_video_info("play_cnt", video_obj['playcnt'])
			
 
				+        item.add_video_info("duration", video_obj['duration'])
			
 
				+        item.add_video_info("platform", platform)
			
 
				+        item.add_video_info("strategy", "search")
			
 
				+        item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
			
 
				+        mq_obj = item.produce_item()
			
 
				+        return mq_obj
			
 
				+
			
 
				+    @classmethod
			
 
				+    def xg_video_producer(cls, video_obj, user, trace_id):
			
 
				+        """
			
 
				+        西瓜搜索
			
 
				+        :param video_obj:
			
 
				+        :param user:
			
 
				+        :param trace_id:
			
 
				+        :return:
			
 
				+        """
			
 
				+        platform = "xg_search"
			
 
				+        publish_time_stamp = int(video_obj['publish_time'])
			
 
				+        item = VideoItem()
			
 
				+        item.add_video_info("user_id", user["uid"])
			
 
				+        item.add_video_info("user_name", user["nick_name"])
			
 
				+        item.add_video_info("video_id", video_obj['video_id'])
			
 
				+        item.add_video_info("video_title", trace_id)
			
 
				+        item.add_video_info("publish_time_stamp", int(publish_time_stamp))
			
 
				+        item.add_video_info("video_url", video_obj["video_url"])
			
 
				+        item.add_video_info("cover_url", video_obj["cover_url"])
			
 
				+        item.add_video_info("out_video_id", video_obj['video_id'])
			
 
				+        item.add_video_info("play_cnt", video_obj['play_cnt'])
			
 
				+        item.add_video_info("duration", video_obj['duration'])
			
 
				+        item.add_video_info("like_cnt", video_obj['like_cnt'])
			
 
				+        item.add_video_info("out_user_id", trace_id)
			
 
				+        item.add_video_info("platform", platform)
			
 
				+        item.add_video_info("strategy", "search")
			
 
				+        item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
			
 
				+        mq_obj = item.produce_item()
			
 
				+        return mq_obj
			
 
				+
			
 
				+
			
 
				+def video_mq_sender(video_obj, user, trace_id, platform):
			
 
				+    """
			
 
				+    异步处理微信 video_obj
			
 
				+    公众号和站内账号一一对应
			
 
				+    :param platform:
			
 
				+    :param user:
			
 
				+    :param trace_id:
			
 
				+    :param video_obj:
			
 
				+    :return:
			
 
				+    """
			
 
				+    ETL_MQ = MQ(topic_name="topic_crawler_etl_prod")
			
 
				+    Video = VideoProducer()
			
 
				+    if platform == "xg_search":
			
 
				+        mq_obj = Video.xg_video_producer(
			
 
				+            video_obj=video_obj,
			
 
				+            user=user,
			
 
				+            trace_id=trace_id,
			
 
				+        )
			
 
				+    elif platform == "baidu_search":
			
 
				+        mq_obj = Video.baidu_video_producer(
			
 
				+            video_obj=video_obj,
			
 
				+            user=user,
			
 
				+            trace_id=trace_id,
			
 
				+        )
			
 
				+    elif platform == "wx_search":
			
 
				+        mq_obj = Video.wx_video_producer(
			
 
				+            video_obj=video_obj,
			
 
				+            user=user,
			
 
				+            trace_id=trace_id,
			
 
				+        )
			
 
				+    else:
			
 
				+        mq_obj = {}
			
 
				+    ETL_MQ.send_msg(params=mq_obj)
			
 
				+    logging(
			
 
				+        code="6002",
			
 
				+        info="发送消息至 ETL",
			
 
				+        data=mq_obj,
			
 
				+        trace_id=trace_id
			
 
				+    )
			
--- a/applications/routes.py
+++ b/applications/routes.py
@@ -35,7 +35,7 @@ async def search_videos_from_the_web():
 
				     :return:
			
 
				     """
			
 
				     params = await request.get_json()
			
 
				-    title = params['title']
			
 
				+    title = params['title'].replace("【非头次】", "")
			
 
				     gh_id = params['ghId']
			
 
				     trace_id = "search-{}-{}".format(str(uuid.uuid4()), str(int(time.time())))
			
 
				     params['trace_id'] = trace_id
			
@@ -46,36 +46,36 @@ async def search_videos_from_the_web():
 
				         function="search_videos_from_the_web",
			
 
				         trace_id=trace_id
			
 
				     )
			
 
				-    try:
			
 
				-        title_p = os.path.join(os.getcwd(), 'applications', 'static', "titles", "{}.json".format(title))
			
 
				-        if os.path.exists(title_p):
			
 
				-            logging(
			
 
				-                code="2001",
			
 
				-                info="该标题已经被 kimi 处理过，跳过请求 kimi 操作--- {}".format(title),
			
 
				-                function="search_videos_from_the_web",
			
 
				-                trace_id=trace_id
			
 
				-            )
			
 
				-        else:
			
 
				-            KimiServer().ask_kimi_and_save_to_local((title, trace_id, title_p))
			
 
				-        await asyncio.sleep(1)
			
 
				-        kimi_title = KimiServer().kimi_title(title)
			
 
				-        search_videos(
			
 
				-            title=title,
			
 
				-            video_path=title_p,
			
 
				-            trace_id=trace_id,
			
 
				-            gh_id=gh_id,
			
 
				+    # try:
			
 
				+    title_p = os.path.join(os.getcwd(), 'applications', 'static', "titles", "{}.json".format(title))
			
 
				+    if os.path.exists(title_p):
			
 
				+        logging(
			
 
				+            code="2001",
			
 
				+            info="该标题已经被 kimi 处理过，跳过请求 kimi 操作--- {}".format(title),
			
 
				+            function="search_videos_from_the_web",
			
 
				+            trace_id=trace_id
			
 
				         )
			
 
				-        res = {
			
 
				-            "trace_id": trace_id,
			
 
				-            "code": 0,
			
 
				-            "kimi_title": kimi_title
			
 
				-        }
			
 
				-    except Exception as e:
			
 
				-        res = {
			
 
				-            "trace_id": trace_id,
			
 
				-            "code": 1,
			
 
				-            "message": str(e)
			
 
				-        }
			
 
				+    else:
			
 
				+        KimiServer().ask_kimi_and_save_to_local((title, trace_id, title_p))
			
 
				+    await asyncio.sleep(1)
			
 
				+    kimi_title = KimiServer().kimi_title(title)
			
 
				+    search_videos(
			
 
				+        title=title,
			
 
				+        video_path=title_p,
			
 
				+        trace_id=trace_id,
			
 
				+        gh_id=gh_id,
			
 
				+    )
			
 
				+    res = {
			
 
				+        "trace_id": trace_id,
			
 
				+        "code": 0,
			
 
				+        "kimi_title": kimi_title
			
 
				+    }
			
 
				+    # except Exception as e:
			
 
				+    #     res = {
			
 
				+    #         "trace_id": trace_id,
			
 
				+    #         "code": 1,
			
 
				+    #         "message": str(e)
			
 
				+    #     }
			
 
				     return jsonify(res)
			
 
				 
			
 
				 
			
--- a/applications/schedule/process_schedule.py
+++ b/applications/schedule/process_schedule.py
@@ -3,7 +3,6 @@
 
				 对请求进行操作
			
 
				 """
			
 
				 import json
			
 
				-import time
			
 
				 import os
			
 
				 
			
 
				 from applications.match_alg import best_choice
			
--- a/applications/schedule/search_schedule.py
+++ b/applications/schedule/search_schedule.py
@@ -3,69 +3,14 @@
 
				 调用接口在微信内搜索视频
			
 
				 """
			
 
				 import json
			
 
				-import time
			
 
				-import requests
			
 
				 
			
 
				-from applications.functions.mq import MQ
			
 
				-from applications.functions.log import logging
			
 
				+from applications.search import *
			
 
				 from applications.static.config import gh_id_dict
			
 
				-from applications.functions.item import VideoItem
			
 
				-
			
 
				-
			
 
				-def wx_search(keys):
			
 
				-    """
			
 
				-    WeChat search
			
 
				-    :param keys:
			
 
				-    :return:
			
 
				-    """
			
 
				-    url = "http://8.217.190.241:8888/crawler/wei_xin/keyword"
			
 
				-    payload = json.dumps({
			
 
				-        "keyword": keys,
			
 
				-        "cursor": "0",
			
 
				-        "content_type": "video"
			
 
				-    })
			
 
				-    headers = {
			
 
				-        'Content-Type': 'application/json'
			
 
				-    }
			
 
				-    response = requests.request("POST", url, headers=headers, data=payload)
			
 
				-    return response.json()
			
 
				-
			
 
				-
			
 
				-def process_weixin_video_obj(video_obj, user, trace_id):
			
 
				-    """
			
 
				-    异步处理微信 video_obj
			
 
				-    公众号和站内账号一一对应
			
 
				-    :param trace_id:
			
 
				-    :param user:
			
 
				-    :param video_obj:
			
 
				-    :return:
			
 
				-    """
			
 
				-    ETL_MQ = MQ(topic_name="topic_crawler_etl_prod")
			
 
				-    platform = "weixin_search"
			
 
				-    publish_time_stamp = int(video_obj['pubTime'])
			
 
				-    item = VideoItem()
			
 
				-    item.add_video_info("user_id", user["uid"])
			
 
				-    item.add_video_info("user_name", user["nick_name"])
			
 
				-    item.add_video_info("video_id", video_obj['hashDocID'])
			
 
				-    item.add_video_info("video_title", trace_id)
			
 
				-    item.add_video_info("publish_time_stamp", int(publish_time_stamp))
			
 
				-    item.add_video_info("video_url", video_obj["videoUrl"])
			
 
				-    item.add_video_info("cover_url", video_obj["image"])
			
 
				-    item.add_video_info("out_video_id", video_obj['hashDocID'])
			
 
				-    item.add_video_info("out_user_id", trace_id)
			
 
				-    item.add_video_info("platform", platform)
			
 
				-    item.add_video_info("strategy", "search")
			
 
				-    item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
			
 
				-    mq_obj = item.produce_item()
			
 
				-    ETL_MQ.send_msg(params=mq_obj)
			
 
				-    logging(
			
 
				-        code="6002",
			
 
				-        info="发送消息至 ETL",
			
 
				-        data=mq_obj
			
 
				-    )
			
 
				+from applications.functions.log import logging
			
 
				+from applications.functions.video_item import video_mq_sender
			
 
				 
			
 
				 
			
 
				-def return_video(video_path, title, trace_id):
			
 
				+def recall_search_video(video_path, title, trace_id):
			
 
				     """
			
 
				     search and send msg to ETL
			
 
				     :param trace_id:
			
@@ -76,81 +21,45 @@ def return_video(video_path, title, trace_id):
 
				     with open(video_path, encoding='utf-8') as f:
			
 
				         my_obj = json.loads(f.read())
			
 
				     if my_obj:
			
 
				-        # 三者都搜索，优先搜索 title
			
 
				-        title_result = wx_search(keys=title)
			
 
				-        if title_result['msg'] == '未知错误':
			
 
				-            logging(
			
 
				-                code="7001",
			
 
				-                info="通过标题搜索失败---{}".format(title),
			
 
				-                trace_id=trace_id
			
 
				-            )
			
 
				-        else:
			
 
				-            obj_list = title_result['data']['data']
			
 
				-            if obj_list:
			
 
				-                return obj_list[0]
			
 
				-            # for obj in obj_list:
			
 
				-            #     try:
			
 
				-            #         title = obj['items'][0]['title'].replace('<em class=\"highlight\">', '').replace('</em>',
			
 
				-            #                                                                                      '').replace("#",
			
 
				-            #                                                                                             "")
			
 
				-            #         if Functions().sensitive_flag(title):
			
 
				-            #             return obj
			
 
				-            #         else:
			
 
				-            #             continue
			
 
				-            #     except Exception as e:
			
 
				-            #         print(e)
			
 
				-            #         continue
			
 
				-
			
 
				-        # # search_keys
			
 
				-        search_keys_result = wx_search(keys=my_obj['search_keys'][0])
			
 
				-        if search_keys_result['msg'] == '未知错误':
			
 
				-            logging(
			
 
				-                code="7001",
			
 
				-                info="通过搜索词搜索失败---{}".format(title),
			
 
				-                trace_id=trace_id
			
 
				-            )
			
 
				+        wx_result = wx_search(keys=title)
			
 
				+        if wx_result:
			
 
				+            return {
			
 
				+                "platform": "wx_search",
			
 
				+                "result": wx_result[0]
			
 
				+            }
			
 
				         else:
			
 
				-            obj_list = search_keys_result['data']['data']
			
 
				-            if obj_list:
			
 
				-                return obj_list[0]
			
 
				-        #     for obj in obj_list:
			
 
				-        #         try:
			
 
				-        #             title = obj['items'][0]['title'].replace('<em class=\"highlight\">', '').replace('</em>',
			
 
				-        #                                                                                              '').replace("#",
			
 
				-        #                                                                                                          "")
			
 
				-        #             if Functions().sensitive_flag(title):
			
 
				-        #                 return obj
			
 
				-        #             else:
			
 
				-        #                 continue
			
 
				-        #         except Exception as e:
			
 
				-        #             print(e)
			
 
				-        #             continue
			
 
				-
			
 
				-        # theme
			
 
				-        theme_result = wx_search(keys=my_obj['theme'])
			
 
				-        if theme_result['msg'] == '未知错误':
			
 
				             logging(
			
 
				                 code="7001",
			
 
				-                info="通过主题搜索失败---{}".format(title),
			
 
				+                info="通过微信搜索失败---{}".format(title),
			
 
				                 trace_id=trace_id
			
 
				             )
			
 
				-        else:
			
 
				-            obj_list = theme_result['data']['data']
			
 
				-            if obj_list:
			
 
				-                return obj_list[0]
			
 
				-            # for obj in obj_list:
			
 
				-            #     try:
			
 
				-            #         title = obj['items'][0]['title'].replace('<em class=\"highlight\">', '').replace('</em>',
			
 
				-            #                                                                                          '').replace("#",
			
 
				-            #                                                                                                      "")
			
 
				-            #         if Functions().sensitive_flag(title):
			
 
				-            #             return obj
			
 
				-            #         else:
			
 
				-            #             continue
			
 
				-            #     except Exception as e:
			
 
				-            #         print(e)
			
 
				-            #         continue
			
 
				-        return None
			
 
				+            # 微信搜不到的话，采用好看视频搜索
			
 
				+            baidu_result = hksp_search(key=title)
			
 
				+            if baidu_result:
			
 
				+                return {
			
 
				+                    "platform": "baidu_search",
			
 
				+                    "result": baidu_result[0]
			
 
				+                }
			
 
				+            else:
			
 
				+                # 若好看视频未搜到，则采用西瓜搜索
			
 
				+                logging(
			
 
				+                    code="7001",
			
 
				+                    info="通过baidu搜索失败---{}".format(title),
			
 
				+                    trace_id=trace_id
			
 
				+                )
			
 
				+                xigua_result = xigua_search(title)
			
 
				+                if xigua_result:
			
 
				+                    return {
			
 
				+                        "platform": "xg_search",
			
 
				+                        "result": xigua_result[0]
			
 
				+                    }
			
 
				+                else:
			
 
				+                    logging(
			
 
				+                        code="7001",
			
 
				+                        info="通过西瓜搜索失败---{}".format(title),
			
 
				+                        trace_id=trace_id
			
 
				+                    )
			
 
				+                    return None
			
 
				     else:
			
 
				         logging(
			
 
				             code="7000",
			
@@ -169,18 +78,21 @@ def search_videos(video_path, title, trace_id, gh_id):
 
				     :param trace_id:
			
 
				     :return:
			
 
				     """
			
 
				-    video_obj = return_video(video_path, title, trace_id)
			
 
				-    if video_obj:
			
 
				+    recall_obj = recall_search_video(video_path, title, trace_id)
			
 
				+    platform = recall_obj["platform"]
			
 
				+    recall_video = recall_obj["result"]
			
 
				+    if recall_video:
			
 
				         logging(
			
 
				             code="7002",
			
 
				-            info="视频搜索成功",
			
 
				+            info="视频搜索成功, 搜索平台为--{}".format(platform),
			
 
				             trace_id=trace_id,
			
 
				-            data=video_obj
			
 
				+            data=recall_video
			
 
				         )
			
 
				-        process_weixin_video_obj(
			
 
				-            video_obj=video_obj['items'][0],
			
 
				+        video_mq_sender(
			
 
				+            video_obj=recall_video,
			
 
				             user=gh_id_dict.get(gh_id),
			
 
				-            trace_id=trace_id
			
 
				+            trace_id=trace_id,
			
 
				+            platform=platform
			
 
				         )
			
 
				     else:
			
 
				         logging(
			
--- a/applications/search/hksp_search.py
+++ b/applications/search/hksp_search.py
@@ -7,6 +7,8 @@ import urllib.parse
 
				 import time
			
 
				 import hashlib
			
 
				 
			
 
				+from applications.functions.common import MySQLServer
			
 
				+
			
 
				 
			
 
				 def get_video_detail(video_id):
			
 
				     """
			
@@ -35,7 +37,6 @@ def get_video_detail(video_id):
 
				         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
			
 
				     }
			
 
				     response = requests.request("GET", url, headers=headers, params=params).json()
			
 
				-    # print(json.dumps(response['data']['apiData']['curVideoMeta'], ensure_ascii=False, indent=4))
			
 
				     return response['data']['apiData']['curVideoMeta']
			
 
				 
			
 
				 
			
@@ -43,6 +44,19 @@ def hksp_search(key):
 
				     """
			
 
				     好看视频搜索爬虫
			
 
				     """
			
 
				+    sensitive_words = MySQLServer().select_sensitive_words()
			
 
				+
			
 
				+    def sensitive_flag(s_words, ori_title):
			
 
				+        """
			
 
				+        :param ori_title:
			
 
				+        :param s_words:
			
 
				+        :return:
			
 
				+        """
			
 
				+        for word in s_words:
			
 
				+            if word in ori_title:
			
 
				+                return False
			
 
				+        return True
			
 
				+
			
 
				     timestamp_seconds = time.time()
			
 
				     timestamp_milliseconds = int(timestamp_seconds * 1000)
			
 
				     url = 'https://haokan.baidu.com/haokan/ui-search/pc/search/video'
			
@@ -63,32 +77,23 @@ def hksp_search(key):
 
				         'authority': 'haokan.baidu.com',
			
 
				         'accept': '*/*',
			
 
				         'accept-language': 'zh,en;q=0.9,zh-CN;q=0.8',
			
 
				-        'cookie': "BIDUPSID='",
			
 
				-        # 'referer': 'https://haokan.baidu.com/web/search/page?query=%E8%80%81%E4%BA%BA',
			
 
				-        'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
			
 
				-        'sec-ch-ua-mobile': '?0',
			
 
				-        'sec-ch-ua-platform': '"macOS"',
			
 
				-        'sec-fetch-dest': 'empty',
			
 
				-        'sec-fetch-mode': 'cors',
			
 
				-        'sec-fetch-site': 'same-origin',
			
 
				+        'cookie': "BIDUPSID=",
			
 
				         'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
			
 
				         'x-requested-with': 'xmlhttprequest',
			
 
				     }
			
 
				     # 发送GET请求
			
 
				     response = requests.get(url, headers=headers, params=params).json()
			
 
				-    # print(json.dumps(response, ensure_ascii=False, indent=4))
			
 
				     data_list = response['data']['list']
			
 
				+
			
 
				     L = []
			
 
				-    for data in data_list[:5]:
			
 
				+    for data in data_list:
			
 
				         try:
			
 
				             video_id = data['vid']
			
 
				             res = get_video_detail(video_id)
			
 
				-            temp = ["haokanshipin", res['title'], res['playurl'], "https://haokan.baidu.com/v?vid={}".format(video_id)]
			
 
				-            L.append(temp)
			
 
				+            if sensitive_flag(sensitive_words, ['title']) and int(res['duration']) <= 300:
			
 
				+                L.append(res)
			
 
				+            else:
			
 
				+                continue
			
 
				         except:
			
 
				             pass
			
 
				     return L
			
 
				-
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    hksp_search("美国竟对中国提出4个荒唐的条件，真是好大的口气")
			
--- a/applications/search/weixin_search.py
+++ b/applications/search/weixin_search.py
@@ -4,6 +4,8 @@
 
				 import json
			
 
				 import requests
			
 
				 
			
 
				+from applications.functions.common import MySQLServer
			
 
				+
			
 
				 
			
 
				 def wx_search(keys):
			
 
				     """
			
@@ -11,6 +13,20 @@ def wx_search(keys):
 
				     :param keys:
			
 
				     :return:
			
 
				     """
			
 
				+
			
 
				+    sensitive_words = MySQLServer().select_sensitive_words()
			
 
				+
			
 
				+    def sensitive_flag(s_words, ori_title):
			
 
				+        """
			
 
				+        :param ori_title:
			
 
				+        :param s_words:
			
 
				+        :return:
			
 
				+        """
			
 
				+        for word in s_words:
			
 
				+            if word in ori_title:
			
 
				+                return False
			
 
				+        return True
			
 
				+
			
 
				     url = "http://8.217.190.241:8888/crawler/wei_xin/keyword"
			
 
				     payload = json.dumps({
			
 
				         "keyword": keys,
			
@@ -20,5 +36,23 @@ def wx_search(keys):
 
				     headers = {
			
 
				         'Content-Type': 'application/json'
			
 
				     }
			
 
				-    response = requests.request("POST", url, headers=headers, data=payload)
			
 
				-    return response.json()
			
 
				+    response = requests.request("POST", url, headers=headers, data=payload).json()
			
 
				+    if response['msg'] == '未知错误':
			
 
				+        return []
			
 
				+    else:
			
 
				+        L = []
			
 
				+        if response['data']:
			
 
				+            video_list = response['data']['data']
			
 
				+            for video in video_list:
			
 
				+                try:
			
 
				+                    video_info = video['items'][0]
			
 
				+                    title = video_info['title']
			
 
				+                    duration_str = video_info['duration']
			
 
				+                    dr = int(duration_str.split(":")[0].strip()) + int(duration_str.split(":")[1].strip())
			
 
				+                    if sensitive_flag(sensitive_words, title) and dr <= 300:
			
 
				+                        L.append(video_info)
			
 
				+                    else:
			
 
				+                        continue
			
 
				+                except:
			
 
				+                    pass
			
 
				+        return L
			
--- a/applications/search/xigua_search.py
+++ b/applications/search/xigua_search.py
@@ -4,183 +4,188 @@
 
				 """
			
 
				 import re
			
 
				 import json
			
 
				-import time
			
 
				-import random
			
 
				 import base64
			
 
				+import requests
			
 
				 import urllib.parse
			
 
				 
			
 
				-import requests
			
 
				 from lxml import etree
			
 
				 from Crypto.Cipher import AES
			
 
				 from Crypto.Util.Padding import unpad
			
 
				 from fake_useragent import FakeUserAgent
			
 
				 
			
 
				-
			
 
				-def byte_dance_cookie(item_id):
			
 
				-    """
			
 
				-    获取西瓜视频的 cookie
			
 
				-    :param item_id:
			
 
				-    """
			
 
				-    sess = requests.Session()
			
 
				-    sess.headers.update({
			
 
				-        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
			
 
				-        'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
			
 
				-    })
			
 
				-
			
 
				-    # 获取 cookies
			
 
				-    sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
			
 
				-    data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
			
 
				-    r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
			
 
				-    # print(r.text)
			
 
				-    return r.cookies.values()[0]
			
 
				+from applications.functions.common import MySQLServer
			
 
				 
			
 
				 
			
 
				-def aes_decrypt(data: str, key: str) -> str:
			
 
				+class XiGuaFunctions(object):
			
 
				     """
			
 
				-    XiGua AES decrypt
			
 
				-    :param data:
			
 
				-    :param key:
			
 
				-    :return:
			
 
				+    XiGuaSearch Class
			
 
				     """
			
 
				-    password = key.encode()
			
 
				-    iv = password[:16]
			
 
				-    try:
			
 
				-        ct = base64.b64decode(data.encode())
			
 
				-        cipher = AES.new(password, AES.MODE_CBC, iv)
			
 
				-        pt = unpad(cipher.decrypt(ct), AES.block_size)
			
 
				-        return base64.b64decode(pt).decode()
			
 
				-    except Exception as e:
			
 
				-        print("Incorrect decryption {}".format(e))
			
 
				-        return None
			
 
				-
			
 
				 
			
 
				-def extract_video_url(text):
			
 
				-    """
			
 
				-    获取视频 video_url
			
 
				-    :param text:
			
 
				-    :return:
			
 
				-    """
			
 
				-    HTML = etree.HTML(text)
			
 
				-    str_2 = HTML.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
			
 
				-    json_2 = str_2[str_2.find('{'):str_2.rfind('}') + 1]
			
 
				-    Irregulars = ['null', 'undefined', '=false', '=true', 'false', 'true']
			
 
				-    # python中不规则的定义
			
 
				-    for I in Irregulars:
			
 
				-        if I in ['=false', '=true']:
			
 
				-            json_2 = json_2.replace(I, '=' + I[1:].capitalize())
			
 
				+    @classmethod
			
 
				+    def tunnel_proxies(cls):
			
 
				+        """
			
 
				+            快代理方法
			
 
				+            :return:
			
 
				+            """
			
 
				+        tunnel = "q796.kdltps.com:15818"
			
 
				+        username = "t17772369458618"
			
 
				+        password = "5zqcjkmy"
			
 
				+        proxies = {
			
 
				+            "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
			
 
				+            "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
			
 
				+        }
			
 
				+        return proxies
			
 
				+
			
 
				+    @classmethod
			
 
				+    def byte_dance_cookie(cls, item_id):
			
 
				+        """
			
 
				+        获取西瓜视频的 cookie
			
 
				+        :param item_id:
			
 
				+        """
			
 
				+        sess = requests.Session()
			
 
				+        sess.headers.update({
			
 
				+            'user-agent': FakeUserAgent().chrome,
			
 
				+            'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
			
 
				+        })
			
 
				+
			
 
				+        # 获取 cookies
			
 
				+        sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
			
 
				+        data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
			
 
				+        r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
			
 
				+        if r.json()['redirect_url']:
			
 
				+            requests.get(
			
 
				+                url=r.json()['redirect_url']
			
 
				+            )
			
 
				+        return r.cookies.values()[0]
			
 
				+
			
 
				+    @classmethod
			
 
				+    def aes_decrypt(cls, data, key):
			
 
				+        """
			
 
				+        XiGua AES decrypt
			
 
				+        :param data:
			
 
				+        :param key:
			
 
				+        :return:
			
 
				+        """
			
 
				+        password = key.encode()
			
 
				+        iv = password[:16]
			
 
				+        try:
			
 
				+            ct = base64.b64decode(data.encode())
			
 
				+            cipher = AES.new(password, AES.MODE_CBC, iv)
			
 
				+            pt = unpad(cipher.decrypt(ct), AES.block_size)
			
 
				+            return base64.b64decode(pt).decode()
			
 
				+        except Exception as e:
			
 
				+            print("Incorrect decryption {}".format(e))
			
 
				+            return None
			
 
				+
			
 
				+    @classmethod
			
 
				+    def extract_video_url(cls, text):
			
 
				+        """
			
 
				+        获取视频 video_url
			
 
				+        :param text:
			
 
				+        :return:
			
 
				+        """
			
 
				+        HTML = etree.HTML(text)
			
 
				+        str_2 = HTML.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
			
 
				+        json_2 = str_2[str_2.find('{'):str_2.rfind('}') + 1]
			
 
				+        Irregulars = ['null', 'undefined', '=false', '=true', 'false', 'true']
			
 
				+        # python中不规则的定义
			
 
				+        for I in Irregulars:
			
 
				+            if I in ['=false', '=true']:
			
 
				+                json_2 = json_2.replace(I, '=' + I[1:].capitalize())
			
 
				+            else:
			
 
				+                json_2 = json_2.replace(I, '12')
			
 
				+        dict_2 = json.loads(json_2)["anyVideo"]["gidInformation"]["packerData"]["video"]
			
 
				+        duration = dict_2["video_duration"]
			
 
				+        play_cnt = dict_2['video_watch_count']
			
 
				+        publish_time = int(dict_2['video_publish_time'])
			
 
				+        like_cnt = dict_2['video_like_count']
			
 
				+        video_title = dict_2['title']
			
 
				+        video_id = dict_2['vid']
			
 
				+        video_res = dict_2['videoResource']
			
 
				+        cover_url = dict_2['poster_url'].replace("\\u002F", "/")
			
 
				+        if video_res['dash'] == 12:
			
 
				+            obj = video_res['normal']
			
 
				+            ptk = obj['ptk']
			
 
				+            video_list = obj['video_list']
			
 
				+            keys = list(video_list.keys())
			
 
				+            main_url = video_list[keys[-1]]['main_url']
			
 
				+            real_video_url = cls.aes_decrypt(data=main_url, key=ptk)
			
 
				         else:
			
 
				-            json_2 = json_2.replace(I, '12')
			
 
				-    dict_2 = json.loads(json_2)["anyVideo"]["gidInformation"]["packerData"]["video"]["videoResource"]
			
 
				-    if dict_2['dash'] == 12:
			
 
				-        obj = dict_2['normal']
			
 
				-        ptk = obj['ptk']
			
 
				-        main_url = obj['video_list']['video_3']['main_url']
			
 
				-        real_video_url = aes_decrypt(data=main_url, key=ptk)
			
 
				-    else:
			
 
				-        obj = dict_2['dash']
			
 
				-        ptk = obj["ptk"]
			
 
				-        video_url = obj['dynamic_video']['main_url']
			
 
				-        real_video_url = aes_decrypt(data=video_url, key=ptk)
			
 
				-    return real_video_url
			
 
				-
			
 
				-
			
 
				-def extract_info_by_re(text):
			
 
				-    """
			
 
				-    通过正则表达式获取文本中的信息
			
 
				-    :param text:
			
 
				-    :return:
			
 
				-    """
			
 
				-    # 标题
			
 
				-    title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
			
 
				-    if title_match:
			
 
				-        title_content = title_match.group(1)
			
 
				-        title_content = title_content.split(" - ")[0]
			
 
				-        title_content = bytes(title_content, "latin1").decode()
			
 
				-    else:
			
 
				-        title_content = ""
			
 
				-
			
 
				-    # video_id
			
 
				-    video_id = re.search(r'"vid":"(.*?)"', text).group(1)
			
 
				-
			
 
				-    # like_count
			
 
				-    like_count = re.search(r'"video_like_count":(.*?),', text).group(1)
			
 
				-
			
 
				-    # cover_url
			
 
				-    cover_url = re.search(r'"avatar_url":"(.*?)"', text).group(1)
			
 
				-
			
 
				-    # video_play
			
 
				-    video_watch_count = re.search(r'"video_watch_count":(.*?),', text).group(1)
			
 
				-
			
 
				-    # "video_publish_time"
			
 
				-    publish_time = re.search(r'"video_publish_time":"(.*?)"', text).group(1)
			
 
				-
			
 
				-    # video_duration
			
 
				-    duration = re.search(r'("video_duration":)(.*?)"', text).group(2).replace(",", "")
			
 
				-
			
 
				-    return {
			
 
				-        "title": title_content,
			
 
				-        "url": extract_video_url(text),
			
 
				-        "video_id": video_id,
			
 
				-        "like_count": like_count,
			
 
				-        "cover_url": cover_url,
			
 
				-        "play_count": video_watch_count,
			
 
				-        "publish_time": publish_time,
			
 
				-        "duration": duration
			
 
				-    }
			
 
				-
			
 
				-
			
 
				-def get_video_info(item_id):
			
 
				-    """
			
 
				-    获取视频信息
			
 
				-    """
			
 
				-    url = "https://www.ixigua.com/{}".format(item_id)
			
 
				-    headers = {
			
 
				-        "accept-encoding": "gzip, deflate",
			
 
				-        "accept-language": "zh-CN,zh-Hans;q=0.9",
			
 
				-        "cookie": "ttwid={}".format(byte_dance_cookie(item_id)),
			
 
				-        "user-agent": FakeUserAgent().random,
			
 
				-        "referer": "https://www.ixigua.com/{}/".format(item_id),
			
 
				-    }
			
 
				-    response = requests.get(
			
 
				-        url=url,
			
 
				-        headers=headers,
			
 
				-        # proxies=tunnel_proxies(),
			
 
				-        timeout=5,
			
 
				-    )
			
 
				-    time.sleep(random.randint(1, 5))
			
 
				-    video_info = extract_info_by_re(response.text)
			
 
				-
			
 
				-    video_dict = {
			
 
				-        "video_title": video_info.get("title", ""),
			
 
				-        "video_id": video_info.get("video_id"),
			
 
				-        "gid": str(item_id),
			
 
				-        "play_cnt": int(video_info.get("play_count", 0)),
			
 
				-        "like_cnt": int(video_info.get("like_count", 0)),
			
 
				-        "comment_cnt": 0,
			
 
				-        "share_cnt": 0,
			
 
				-        "favorite_cnt": 0,
			
 
				-        "duration": int(video_info.get("duration", 0)),
			
 
				-        "video_width": 0,
			
 
				-        "video_height": 0,
			
 
				-        "publish_time_stamp": int(video_info.get("publish_time", 0)),
			
 
				-        "publish_time_str": time.strftime(
			
 
				-            "%Y-%m-%d %H:%M:%S",
			
 
				-            time.localtime(int(video_info.get("publish_time", 0))),
			
 
				-        ),
			
 
				-        "avatar_url": str(
			
 
				-            video_info.get("user_info", {}).get("avatar_url", "")
			
 
				-        ),
			
 
				-        "cover_url": video_info.get("cover_url", "").replace("\\u002F", "/"),
			
 
				-        "video_url": video_info.get("url"),
			
 
				-        "session": f"xigua-author-{int(time.time())}",
			
 
				-    }
			
 
				-    return video_dict
			
 
				+            obj = video_res['dash']
			
 
				+            ptk = obj["ptk"]
			
 
				+            video_url = obj['dynamic_video']['main_url']
			
 
				+            real_video_url = cls.aes_decrypt(data=video_url, key=ptk)
			
 
				+        return {
			
 
				+            "video_url": real_video_url,
			
 
				+            "cover_url": cover_url,
			
 
				+            "video_id": video_id,
			
 
				+            "video_title": video_title,
			
 
				+            "like_cnt": like_cnt,
			
 
				+            "play_cnt": play_cnt,
			
 
				+            "publish_time": publish_time,
			
 
				+            "duration": duration
			
 
				+        }
			
 
				+
			
 
				+    @classmethod
			
 
				+    def extract_info_by_re(cls, text):
			
 
				+        """
			
 
				+        通过正则表达式获取文本中的信息
			
 
				+        :param text:
			
 
				+        :return:
			
 
				+        """
			
 
				+        result = cls.extract_video_url(text)
			
 
				+        # 标题
			
 
				+        title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
			
 
				+        if title_match:
			
 
				+            title_content = title_match.group(1)
			
 
				+            title_content = title_content.split(" - ")[0]
			
 
				+            try:
			
 
				+                title_content = bytes(title_content, "latin1").decode()
			
 
				+            except:
			
 
				+                title_content = title_content
			
 
				+        else:
			
 
				+            title_content = ""
			
 
				+        result['video_title'] = title_content
			
 
				+        return result
			
 
				+
			
 
				+    @classmethod
			
 
				+    def get_video_info(cls, item_id):
			
 
				+        """
			
 
				+        获取视频信息
			
 
				+        """
			
 
				+        url = "https://www.ixigua.com/{}".format(item_id)
			
 
				+        headers = {
			
 
				+            "accept-encoding": "gzip, deflate",
			
 
				+            "accept-language": "zh-CN,zh-Hans;q=0.9",
			
 
				+            "cookie": "ttwid={}".format(cls.byte_dance_cookie(item_id)),
			
 
				+            "user-agent": FakeUserAgent().random,
			
 
				+            "referer": "https://www.ixigua.com/{}/".format(item_id),
			
 
				+        }
			
 
				+        response = requests.get(
			
 
				+            url=url,
			
 
				+            headers=headers
			
 
				+        )
			
 
				+        video_info = cls.extract_info_by_re(response.text)
			
 
				+        return video_info
			
 
				 
			
 
				 
			
 
				 def xigua_search(keyword):
			
 
				     """
			
 
				     搜索
			
 
				     """
			
 
				+    sensitive_words = MySQLServer().select_sensitive_words()
			
 
				+
			
 
				+    def sensitive_flag(s_words, ori_title):
			
 
				+        """
			
 
				+        :param ori_title:
			
 
				+        :param s_words:
			
 
				+        :return:
			
 
				+        """
			
 
				+        for word in s_words:
			
 
				+            if word in ori_title:
			
 
				+                return False
			
 
				+        return True
			
 
				+
			
 
				     keyword = urllib.parse.quote(keyword)
			
 
				     base_url = "https://www.ixigua.com/search/{}/ab_name=search&fss=input".format(
			
 
				         keyword
			
@@ -191,13 +196,6 @@ def xigua_search(keyword):
 
				         "accept-language": "zh,en;q=0.9,zh-CN;q=0.8",
			
 
				         "cache-control": "max-age=0",
			
 
				         "cookie": "ixigua-a-s=1; support_webp=true; support_avif=true; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=Ur23fgYD2pMJOvi1BpILyfaobg8wA7IhGwmQx260ULRa8Dvjaxc5ZA63BUIP-6Vi473f; ttwid=1%7CNtTtSp4Iej-v0nWtepdZH3d3Ts6uGNMFzTN20ps1cdo%7C1708236945%7Cc1f301c64aa3bf69cdaa41f28856e2bb7b7eed16583f8c92d50cffa2d9944fc6; msToken=rr418opQf04vm8n9s8FAGdr1AoCUsvAOGKSDPbBEfwVS1sznxxZCvcZTI93qVz5uAXlX9yRwcKlNQZ4wMro2DmlHw5yWHAVeKr_SzgO1KtVVnjUMTUNEux_cq1-EIkI=",
			
 
				-        "sec-ch-ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
			
 
				-        "sec-ch-ua-mobile": "?0",
			
 
				-        "sec-ch-ua-platform": '"macOS"',
			
 
				-        "sec-fetch-dest": "document",
			
 
				-        "sec-fetch-mode": "navigate",
			
 
				-        "sec-fetch-site": "none",
			
 
				-        "sec-fetch-user": "?1",
			
 
				         "upgrade-insecure-requests": "1",
			
 
				         "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
			
 
				     }
			
@@ -206,14 +204,20 @@ def xigua_search(keyword):
 
				     result = html.xpath(
			
 
				         '//a[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]/@href'
			
 
				     )
			
 
				-    res_list = []
			
 
				-    for page_id in result[:5]:
			
 
				-        doc_id = page_id[1:].split("?")[0]
			
 
				-        try:
			
 
				-            res = get_video_info(doc_id)
			
 
				-            temp = ["xigua", res['video_title'], res['video_url'], "https://www.ixigua.com/{}".format(doc_id)]
			
 
				-            res_list.append(temp)
			
 
				-        except:
			
 
				-            pass
			
 
				-    return res_list
			
 
				-
			
 
				+    if result:
			
 
				+        L = []
			
 
				+        doc_id_list = [page_id[1:] for page_id in result]
			
 
				+        for doc_id in doc_id_list:
			
 
				+            try:
			
 
				+                video_d = XiGuaFunctions().get_video_info(doc_id)
			
 
				+                video_title = video_d['video_title']
			
 
				+                if sensitive_flag(sensitive_words, video_title) and int(video_d['duration']) <= 300:
			
 
				+                    L.append(video_d)
			
 
				+                else:
			
 
				+                    continue
			
 
				+            except Exception as e:
			
 
				+                print(e)
			
 
				+                continue
			
 
				+        return L
			
 
				+    else:
			
 
				+        return []
			
--- a/applications/static/config.py
+++ b/applications/static/config.py
@@ -299,4 +299,29 @@ gh_id_dict = {
 
				         "uid": 69637480,
			
 
				         "nick_name": "风间"
			
 
				     }
			
 
				-}
			
 
				+}
			
 
				+
			
 
				+sensitive_words = [
			
 
				+    "台湾",
			
 
				+    "南海",
			
 
				+    "强奸",
			
 
				+    "寂寞难耐",
			
 
				+    "欲求不满",
			
 
				+    "不雅视频",
			
 
				+    "人妻",
			
 
				+    "侵犯",
			
 
				+    "正部级",
			
 
				+    "外长",
			
 
				+    "邓小平",
			
 
				+    "林彪",
			
 
				+    "李先念",
			
 
				+    "毛主席",
			
 
				+    "毛泽东",
			
 
				+    "江青",
			
 
				+    "朱镕基",
			
 
				+    "胡耀邦",
			
 
				+    "政治局",
			
 
				+    "省委书记",
			
 
				+    "国防部长",
			
 
				+    "外交部长"
			
 
				+]
			
--- a/dev/test.py
+++ b/dev/test.py
@@ -10,7 +10,7 @@ body = {
 
				     "content": "",
			
 
				     "cover": "",
			
 
				     "ghId": "gh_d2cc901deca7",
			
 
				-    "title": "江泽民"
			
 
				+    "title": "掘金大战森林狼"
			
 
				 }
			
 
				 a = time.time()
			
 
				 header = {
			
--- a/dev/test_search.py
+++ b/dev/test_search.py
@@ -0,0 +1,15 @@
 
				+"""
			
 
				+@author: luojunhui
			
 
				+"""
			
 
				+from applications.search import *
			
 
				+
			
 
				+keys = "湖人大战勇士"
			
 
				+
			
 
				+wx_result = wx_search(keys)
			
 
				+print(wx_result)
			
 
				+
			
 
				+xg_result = xigua_search(keys)
			
 
				+print(xg_result)
			
 
				+
			
 
				+baidu_result = hksp_search(keys)
			
 
				+print(baidu_result)