1 vuosi sitten · eebeaef171
--- a/applications/functions/common.py
+++ b/applications/functions/common.py
@@ -3,6 +3,7 @@
 
															 """
														
 
															 import os
														
 
															 import json
														
 
															+import time
														
 
															 import uuid
														
 
															 import requests
														
 
															 import pymysql
														
@@ -17,21 +18,6 @@ class Functions(object):
 
															     通用工具代码
														
 
															     """
														
 
															-    # 敏感词逻辑
														
 
															-    @classmethod
														
 
															-    def sensitive_flag(cls, title):
														
 
															-        """
														
 
															-        判断标题是否命中过滤词
														
 
															-        :param title:
														
 
															-        :return:
														
 
															-        """
														
 
															-        sensitive_words = MySQLServer().select_sensitive_words()
														
 
															-        for word in sensitive_words:
														
 
															-            if word in title:
														
 
															-                # title = title.replace(word, "*")
														
 
															-                return False
														
 
															-        return True
														
 
															-
														
 
															     # 自动加入白名单逻辑
														
 
															     @classmethod
														
 
															     def auto_white(cls, root_share_id):
														
@@ -155,7 +141,8 @@ class MySQLServer(object):
 
															         :param trace_id:
														
 
															         :return:
														
 
															         """
														
 
															-        sql = "select video_id from crawler_video where out_user_id = '{}' and video_title = '{}';".format(trace_id, trace_id)
														
 
															+        sql = "select video_id from crawler_video where out_user_id = '{}' and video_title = '{}';".format(trace_id,
														
 
															+                                                                                                           trace_id)
														
 
															         connection = pymysql.connect(
														
 
															             host="rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com",  # 数据库IP地址，内网地址
														
 
															             port=3306,  # 端口号
														
@@ -168,20 +155,12 @@ class MySQLServer(object):
 
															         cursor.execute(sql)
														
 
															         out_video_list = cursor.fetchall()
														
 
															         if len(out_video_list) > 0:
														
 
															-            vid_list = [i[0] for i in out_video_list if i[0] != 0]
														
 
															-            vid_list = [vid_list[0]]
														
 
															-            # dir_path = os.path.join(os.getcwd(), 'applications', 'static', "out_videos")
														
 
															-            # os.makedirs(os.path.dirname(dir_path), exist_ok=True)
														
 
															-            # done_list = os.listdir(dir_path)
														
 
															-            # process_list = [
														
 
															-            #     (
														
 
															-            #         i[1],
														
 
															-            #         trace_id,
														
 
															-            #         os.path.join(dir_path, "{}.json".format(i[0]))
														
 
															-            #     ) for i in out_video_list if not "{}.json".format(i[0]) in done_list
														
 
															-            # ]
														
 
															-            # if process_list:
														
 
															-            #     ask_kimi_and_save_to_local(process_list[0])
														
 
															+            if out_video_list[0][0] == 0:
														
 
															+                video_id = cls.search_id_to_video(trace_id)
														
 
															+            else:
														
 
															+                video_id = out_video_list[0][0]
														
 
															+
														
 
															+            vid_list = [video_id]
														
 
															             logging(
														
 
															                 code="2003",
														
 
															                 trace_id=trace_id,
														
@@ -252,6 +231,32 @@ class MySQLServer(object):
 
															         result = [line[0] for line in data]
														
 
															         return result
														
 
															+    @classmethod
														
 
															+    def search_id_to_video(cls, trace_id):
														
 
															+        """
														
 
															+        通过 search_id 返回 video_id
														
 
															+        :param trace_id:
														
 
															+        :return:
														
 
															+        """
														
 
															+        sql = "select video_id from crawler_video where out_user_id = '{}' and video_title = '{}';".format(trace_id,
														
 
															+                                                                                                           trace_id)
														
 
															+        connection = pymysql.connect(
														
 
															+            host="rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com",  # 数据库IP地址，内网地址
														
 
															+            port=3306,  # 端口号
														
 
															+            user="crawler",  # mysql用户名
														
 
															+            passwd="crawler123456@",  # mysql用户登录密码
														
 
															+            db="piaoquan-crawler",  # 数据库名
														
 
															+            charset="utf8mb4"  # 如果数据库里面的文本是utf8编码的，charset指定是utf8
														
 
															+        )
														
 
															+        cursor = connection.cursor()
														
 
															+        cursor.execute(sql)
														
 
															+        out_video_list = cursor.fetchall()
														
 
															+        if int(out_video_list[0][0]) == 0:
														
 
															+            time.sleep(1)
														
 
															+            return cls.search_id_to_video(trace_id)
														
 
															+        else:
														
 
															+            return out_video_list[0][0]
														
 
															+
														
 
															 class KimiServer(object):
														
 
															     """
														
@@ -364,4 +369,4 @@ class KimiServer(object):
 
															             model="moonshot-v1-8k",
														
 
															         )
														
 
															         response = chat_completion.choices[0].message.content
														
 
															-        return response
														
 
															+        return response
														
--- a/applications/functions/item.py
+++ b/applications/functions/item.py
@@ -1,98 +0,0 @@
 
															-"""
														
 
															-@author: luojunhui
														
 
															-"""
														
 
															-import time
														
 
															-
														
 
															-from applications.functions.common import Functions
														
 
															-
														
 
															-
														
 
															-class VideoItem(object):
														
 
															-    """
														
 
															-    function: 当扫描进一条视频的时候，对该视频的基本信息进行处理，保证发送给 pipeline和 etl 的 video_dict 是正确的
														
 
															-    __init__: 初始化空json 对象，用来存储视频信息
														
 
															-    add_video_info: 把视频信息存储到 item 对象中
														
 
															-    check_item: 检查 item 对象中的各个元素以及处理
														
 
															-    """
														
 
															-
														
 
															-    def __init__(self):
														
 
															-        self.item = {}
														
 
															-
														
 
															-    def add_video_info(self, key, value):
														
 
															-        self.item[key] = value
														
 
															-
														
 
															-    def check_item(self):
														
 
															-        """
														
 
															-        判断item 里面的字段，是否符合要求
														
 
															-        字段分为 3 类：
														
 
															-        1. 必须存在数据的字段： ["video_id", "user_id", "user_name", "out_user_id", "out_video_id", "session", "video_url", "cover_url", "platform", "strategy"]
														
 
															-        2. 不存在默认为 0 的字段 ：["duration", "play_cnt", "like_cnt", "comment_cnt", "share_cnt", "width", "height"]
														
 
															-        3. 需要后出理的字段： video_title, publish_time
														
 
															-        """
														
 
															-        if self.item.get("video_title"):
														
 
															-            self.item["video_title"] = Functions().clean_title(self.item["video_title"])
														
 
															-        else:
														
 
															-            return False
														
 
															-        if self.item.get("publish_time_stamp"):
														
 
															-            publish_time_str = time.strftime(
														
 
															-                "%Y-%m-%d %H:%M:%S", time.localtime(self.item["publish_time_stamp"])
														
 
															-            )
														
 
															-            self.add_video_info("publish_time_str", publish_time_str)
														
 
															-        else:
														
 
															-            publish_time_stamp = int(time.time())
														
 
															-            publish_time_str = time.strftime(
														
 
															-                "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
														
 
															-            )
														
 
															-            self.add_video_info("publish_time_stamp", publish_time_stamp)
														
 
															-            self.add_video_info("publish_time_str", publish_time_str)
														
 
															-        self.add_video_info("publish_time", publish_time_str)
														
 
															-        if not self.item.get("update_time_stamp"):
														
 
															-            self.add_video_info("update_time_stamp", int(time.time()))
														
 
															-
														
 
															-        # 如果不存在，默认值为 0
														
 
															-        config_keys = [
														
 
															-            "duration",
														
 
															-            "play_cnt",
														
 
															-            "like_cnt",
														
 
															-            "comment_cnt",
														
 
															-            "share_cnt",
														
 
															-            "width",
														
 
															-            "height",
														
 
															-        ]
														
 
															-        for config_key in config_keys:
														
 
															-            if self.item.get(config_key):
														
 
															-                continue
														
 
															-            else:
														
 
															-                self.add_video_info(config_key, 0)
														
 
															-
														
 
															-        # 必须存在的元素，若不存在则会报错
														
 
															-        must_keys = [
														
 
															-            "video_id",
														
 
															-            "user_id",
														
 
															-            "user_name",
														
 
															-            "out_video_id",
														
 
															-            "session",
														
 
															-            "video_url",
														
 
															-            "cover_url",
														
 
															-            "platform",
														
 
															-            "strategy",
														
 
															-        ]
														
 
															-        """
														
 
															-        video_id, out_video_id 均为站外视频 id
														
 
															-        usr_id: 站内用户 id
														
 
															-        out_user_id: 站外用户 id
														
 
															-        user_name: 站外用户名称
														
 
															-        """
														
 
															-        for m_key in must_keys:
														
 
															-            if self.item.get(m_key):
														
 
															-                continue
														
 
															-            else:
														
 
															-                # print(m_key)
														
 
															-                return False
														
 
															-        return True
														
 
															-
														
 
															-    def produce_item(self):
														
 
															-        flag = self.check_item()
														
 
															-        if flag:
														
 
															-            return self.item
														
 
															-        else:
														
 
															-            return False
														
--- a/applications/functions/video_item.py
+++ b/applications/functions/video_item.py
@@ -0,0 +1,244 @@
 
															+"""
														
 
															+@author: luojunhui
														
 
															+"""
														
 
															+import time
														
 
															+from applications.functions.mq import MQ
														
 
															+from applications.functions.log import logging
														
 
															+from applications.functions.common import Functions
														
 
															+
														
 
															+
														
 
															+class VideoItem(object):
														
 
															+    """
														
 
															+    function: 当扫描进一条视频的时候，对该视频的基本信息进行处理，保证发送给 pipeline和 etl 的 video_dict 是正确的
														
 
															+    __init__: 初始化空json 对象，用来存储视频信息
														
 
															+    add_video_info: 把视频信息存储到 item 对象中
														
 
															+    check_item: 检查 item 对象中的各个元素以及处理
														
 
															+    """
														
 
															+
														
 
															+    def __init__(self):
														
 
															+        self.item = {}
														
 
															+
														
 
															+    def add_video_info(self, key, value):
														
 
															+        """
														
 
															+        insert or update video info
														
 
															+        :param key:
														
 
															+        :param value:
														
 
															+        """
														
 
															+        self.item[key] = value
														
 
															+
														
 
															+    def check_item(self):
														
 
															+        """
														
 
															+        判断item 里面的字段，是否符合要求
														
 
															+        字段分为 3 类：
														
 
															+        1. 必须存在数据的字段： ["video_id", "user_id", "user_name", "out_user_id", "out_video_id", "session", "video_url", "cover_url", "platform", "strategy"]
														
 
															+        2. 不存在默认为 0 的字段 ：["duration", "play_cnt", "like_cnt", "comment_cnt", "share_cnt", "width", "height"]
														
 
															+        3. 需要后出理的字段： video_title, publish_time
														
 
															+        """
														
 
															+        if self.item.get("video_title"):
														
 
															+            self.item["video_title"] = Functions().clean_title(self.item["video_title"])
														
 
															+        else:
														
 
															+            return False
														
 
															+        if self.item.get("publish_time_stamp"):
														
 
															+            publish_time_str = time.strftime(
														
 
															+                "%Y-%m-%d %H:%M:%S", time.localtime(self.item["publish_time_stamp"])
														
 
															+            )
														
 
															+            self.add_video_info("publish_time_str", publish_time_str)
														
 
															+        else:
														
 
															+            publish_time_stamp = int(time.time())
														
 
															+            publish_time_str = time.strftime(
														
 
															+                "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
														
 
															+            )
														
 
															+            self.add_video_info("publish_time_stamp", publish_time_stamp)
														
 
															+            self.add_video_info("publish_time_str", publish_time_str)
														
 
															+        self.add_video_info("publish_time", publish_time_str)
														
 
															+        if not self.item.get("update_time_stamp"):
														
 
															+            self.add_video_info("update_time_stamp", int(time.time()))
														
 
															+
														
 
															+        # 如果不存在，默认值为 0
														
 
															+        config_keys = [
														
 
															+            "duration",
														
 
															+            "play_cnt",
														
 
															+            "like_cnt",
														
 
															+            "comment_cnt",
														
 
															+            "share_cnt",
														
 
															+            "width",
														
 
															+            "height",
														
 
															+        ]
														
 
															+        for config_key in config_keys:
														
 
															+            if self.item.get(config_key):
														
 
															+                continue
														
 
															+            else:
														
 
															+                self.add_video_info(config_key, 0)
														
 
															+
														
 
															+        # 必须存在的元素，若不存在则会报错
														
 
															+        must_keys = [
														
 
															+            "video_id",
														
 
															+            "user_id",
														
 
															+            "user_name",
														
 
															+            "out_video_id",
														
 
															+            "session",
														
 
															+            "video_url",
														
 
															+            "cover_url",
														
 
															+            "platform",
														
 
															+            "strategy",
														
 
															+        ]
														
 
															+        """
														
 
															+        video_id, out_video_id 均为站外视频 id
														
 
															+        usr_id: 站内用户 id
														
 
															+        out_user_id: 站外用户 id
														
 
															+        user_name: 站外用户名称
														
 
															+        """
														
 
															+        for m_key in must_keys:
														
 
															+            if self.item.get(m_key):
														
 
															+                continue
														
 
															+            else:
														
 
															+                # print(m_key)
														
 
															+                return False
														
 
															+        return True
														
 
															+
														
 
															+    def produce_item(self):
														
 
															+        """
														
 
															+        item producer
														
 
															+        :return:
														
 
															+        """
														
 
															+        flag = self.check_item()
														
 
															+        if flag:
														
 
															+            return self.item
														
 
															+        else:
														
 
															+            return False
														
 
															+
														
 
															+
														
 
															+class VideoProducer(object):
														
 
															+    """
														
 
															+    处理视频
														
 
															+    todo: baidu && xigua video process
														
 
															+    """
														
 
															+
														
 
															+    @classmethod
														
 
															+    def wx_video_producer(cls, video_obj, user, trace_id):
														
 
															+        """
														
 
															+            异步处理微信 video_obj
														
 
															+            公众号和站内账号一一对应
														
 
															+            :param trace_id:
														
 
															+            :param user:
														
 
															+            :param video_obj:
														
 
															+            :return:
														
 
															+        """
														
 
															+        platform = "weixin_search"
														
 
															+        publish_time_stamp = int(video_obj['pubTime'])
														
 
															+        item = VideoItem()
														
 
															+        item.add_video_info("user_id", user["uid"])
														
 
															+        item.add_video_info("user_name", user["nick_name"])
														
 
															+        item.add_video_info("video_id", video_obj['hashDocID'])
														
 
															+        item.add_video_info("video_title", trace_id)
														
 
															+        item.add_video_info("publish_time_stamp", int(publish_time_stamp))
														
 
															+        item.add_video_info("video_url", video_obj["videoUrl"])
														
 
															+        item.add_video_info("cover_url", video_obj["image"])
														
 
															+        item.add_video_info("out_video_id", video_obj['hashDocID'])
														
 
															+        item.add_video_info("out_user_id", trace_id)
														
 
															+        item.add_video_info("platform", platform)
														
 
															+        item.add_video_info("strategy", "search")
														
 
															+        item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
														
 
															+        mq_obj = item.produce_item()
														
 
															+        return mq_obj
														
 
															+
														
 
															+    @classmethod
														
 
															+    def baidu_video_producer(cls, video_obj, user, trace_id):
														
 
															+        """
														
 
															+        处理好看视频的 video_info
														
 
															+        :param video_obj:
														
 
															+        :param user:
														
 
															+        :param trace_id:
														
 
															+        :return:
														
 
															+        """
														
 
															+        platform = "baidu_search"
														
 
															+        publish_time_stamp = int(video_obj['publish_time'])
														
 
															+        item = VideoItem()
														
 
															+        item.add_video_info("user_id", user["uid"])
														
 
															+        item.add_video_info("user_name", user["nick_name"])
														
 
															+        item.add_video_info("video_id", video_obj['id'])
														
 
															+        item.add_video_info("video_title", trace_id)
														
 
															+        item.add_video_info("publish_time_stamp", publish_time_stamp)
														
 
															+        item.add_video_info("video_url", video_obj["playurl"])
														
 
															+        item.add_video_info("cover_url", video_obj["poster"])
														
 
															+        item.add_video_info("out_video_id", video_obj['id'])
														
 
															+        item.add_video_info("out_user_id", trace_id)
														
 
															+        item.add_video_info("like_cnt", video_obj['like'])
														
 
															+        item.add_video_info("play_cnt", video_obj['playcnt'])
														
 
															+        item.add_video_info("duration", video_obj['duration'])
														
 
															+        item.add_video_info("platform", platform)
														
 
															+        item.add_video_info("strategy", "search")
														
 
															+        item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
														
 
															+        mq_obj = item.produce_item()
														
 
															+        return mq_obj
														
 
															+
														
 
															+    @classmethod
														
 
															+    def xg_video_producer(cls, video_obj, user, trace_id):
														
 
															+        """
														
 
															+        西瓜搜索
														
 
															+        :param video_obj:
														
 
															+        :param user:
														
 
															+        :param trace_id:
														
 
															+        :return:
														
 
															+        """
														
 
															+        platform = "xg_search"
														
 
															+        publish_time_stamp = int(video_obj['publish_time'])
														
 
															+        item = VideoItem()
														
 
															+        item.add_video_info("user_id", user["uid"])
														
 
															+        item.add_video_info("user_name", user["nick_name"])
														
 
															+        item.add_video_info("video_id", video_obj['video_id'])
														
 
															+        item.add_video_info("video_title", trace_id)
														
 
															+        item.add_video_info("publish_time_stamp", int(publish_time_stamp))
														
 
															+        item.add_video_info("video_url", video_obj["video_url"])
														
 
															+        item.add_video_info("cover_url", video_obj["cover_url"])
														
 
															+        item.add_video_info("out_video_id", video_obj['video_id'])
														
 
															+        item.add_video_info("play_cnt", video_obj['play_cnt'])
														
 
															+        item.add_video_info("duration", video_obj['duration'])
														
 
															+        item.add_video_info("like_cnt", video_obj['like_cnt'])
														
 
															+        item.add_video_info("out_user_id", trace_id)
														
 
															+        item.add_video_info("platform", platform)
														
 
															+        item.add_video_info("strategy", "search")
														
 
															+        item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
														
 
															+        mq_obj = item.produce_item()
														
 
															+        return mq_obj
														
 
															+
														
 
															+
														
 
															+def video_mq_sender(video_obj, user, trace_id, platform):
														
 
															+    """
														
 
															+    异步处理微信 video_obj
														
 
															+    公众号和站内账号一一对应
														
 
															+    :param platform:
														
 
															+    :param user:
														
 
															+    :param trace_id:
														
 
															+    :param video_obj:
														
 
															+    :return:
														
 
															+    """
														
 
															+    ETL_MQ = MQ(topic_name="topic_crawler_etl_prod")
														
 
															+    Video = VideoProducer()
														
 
															+    if platform == "xg_search":
														
 
															+        mq_obj = Video.xg_video_producer(
														
 
															+            video_obj=video_obj,
														
 
															+            user=user,
														
 
															+            trace_id=trace_id,
														
 
															+        )
														
 
															+    elif platform == "baidu_search":
														
 
															+        mq_obj = Video.baidu_video_producer(
														
 
															+            video_obj=video_obj,
														
 
															+            user=user,
														
 
															+            trace_id=trace_id,
														
 
															+        )
														
 
															+    elif platform == "wx_search":
														
 
															+        mq_obj = Video.wx_video_producer(
														
 
															+            video_obj=video_obj,
														
 
															+            user=user,
														
 
															+            trace_id=trace_id,
														
 
															+        )
														
 
															+    else:
														
 
															+        mq_obj = {}
														
 
															+    ETL_MQ.send_msg(params=mq_obj)
														
 
															+    logging(
														
 
															+        code="6002",
														
 
															+        info="发送消息至 ETL",
														
 
															+        data=mq_obj,
														
 
															+        trace_id=trace_id
														
 
															+    )
														
--- a/applications/routes.py
+++ b/applications/routes.py
@@ -35,7 +35,7 @@ async def search_videos_from_the_web():
 
															     :return:
														
 
															     """
														
 
															     params = await request.get_json()
														
 
															-    title = params['title']
														
 
															+    title = params['title'].replace("【非头次】", "")
														
 
															     gh_id = params['ghId']
														
 
															     trace_id = "search-{}-{}".format(str(uuid.uuid4()), str(int(time.time())))
														
 
															     params['trace_id'] = trace_id
														
@@ -46,36 +46,36 @@ async def search_videos_from_the_web():
 
															         function="search_videos_from_the_web",
														
 
															         trace_id=trace_id
														
 
															     )
														
 
															-    try:
														
 
															-        title_p = os.path.join(os.getcwd(), 'applications', 'static', "titles", "{}.json".format(title))
														
 
															-        if os.path.exists(title_p):
														
 
															-            logging(
														
 
															-                code="2001",
														
 
															-                info="该标题已经被 kimi 处理过，跳过请求 kimi 操作--- {}".format(title),
														
 
															-                function="search_videos_from_the_web",
														
 
															-                trace_id=trace_id
														
 
															-            )
														
 
															-        else:
														
 
															-            KimiServer().ask_kimi_and_save_to_local((title, trace_id, title_p))
														
 
															-        await asyncio.sleep(1)
														
 
															-        kimi_title = KimiServer().kimi_title(title)
														
 
															-        search_videos(
														
 
															-            title=title,
														
 
															-            video_path=title_p,
														
 
															-            trace_id=trace_id,
														
 
															-            gh_id=gh_id,
														
 
															+    # try:
														
 
															+    title_p = os.path.join(os.getcwd(), 'applications', 'static', "titles", "{}.json".format(title))
														
 
															+    if os.path.exists(title_p):
														
 
															+        logging(
														
 
															+            code="2001",
														
 
															+            info="该标题已经被 kimi 处理过，跳过请求 kimi 操作--- {}".format(title),
														
 
															+            function="search_videos_from_the_web",
														
 
															+            trace_id=trace_id
														
 
															         )
														
 
															-        res = {
														
 
															-            "trace_id": trace_id,
														
 
															-            "code": 0,
														
 
															-            "kimi_title": kimi_title
														
 
															-        }
														
 
															-    except Exception as e:
														
 
															-        res = {
														
 
															-            "trace_id": trace_id,
														
 
															-            "code": 1,
														
 
															-            "message": str(e)
														
 
															-        }
														
 
															+    else:
														
 
															+        KimiServer().ask_kimi_and_save_to_local((title, trace_id, title_p))
														
 
															+    await asyncio.sleep(1)
														
 
															+    kimi_title = KimiServer().kimi_title(title)
														
 
															+    search_videos(
														
 
															+        title=title,
														
 
															+        video_path=title_p,
														
 
															+        trace_id=trace_id,
														
 
															+        gh_id=gh_id,
														
 
															+    )
														
 
															+    res = {
														
 
															+        "trace_id": trace_id,
														
 
															+        "code": 0,
														
 
															+        "kimi_title": kimi_title
														
 
															+    }
														
 
															+    # except Exception as e:
														
 
															+    #     res = {
														
 
															+    #         "trace_id": trace_id,
														
 
															+    #         "code": 1,
														
 
															+    #         "message": str(e)
														
 
															+    #     }
														
 
															     return jsonify(res)
														
--- a/applications/schedule/process_schedule.py
+++ b/applications/schedule/process_schedule.py
@@ -3,7 +3,6 @@
 
															 对请求进行操作
														
 
															 """
														
 
															 import json
														
 
															-import time
														
 
															 import os
														
 
															 from applications.match_alg import best_choice
														
--- a/applications/schedule/search_schedule.py
+++ b/applications/schedule/search_schedule.py
@@ -3,69 +3,14 @@
 
															 调用接口在微信内搜索视频
														
 
															 """
														
 
															 import json
														
 
															-import time
														
 
															-import requests
														
 
															-from applications.functions.mq import MQ
														
 
															-from applications.functions.log import logging
														
 
															+from applications.search import *
														
 
															 from applications.static.config import gh_id_dict
														
 
															-from applications.functions.item import VideoItem
														
 
															-
														
 
															-
														
 
															-def wx_search(keys):
														
 
															-    """
														
 
															-    WeChat search
														
 
															-    :param keys:
														
 
															-    :return:
														
 
															-    """
														
 
															-    url = "http://8.217.190.241:8888/crawler/wei_xin/keyword"
														
 
															-    payload = json.dumps({
														
 
															-        "keyword": keys,
														
 
															-        "cursor": "0",
														
 
															-        "content_type": "video"
														
 
															-    })
														
 
															-    headers = {
														
 
															-        'Content-Type': 'application/json'
														
 
															-    }
														
 
															-    response = requests.request("POST", url, headers=headers, data=payload)
														
 
															-    return response.json()
														
 
															-
														
 
															-
														
 
															-def process_weixin_video_obj(video_obj, user, trace_id):
														
 
															-    """
														
 
															-    异步处理微信 video_obj
														
 
															-    公众号和站内账号一一对应
														
 
															-    :param trace_id:
														
 
															-    :param user:
														
 
															-    :param video_obj:
														
 
															-    :return:
														
 
															-    """
														
 
															-    ETL_MQ = MQ(topic_name="topic_crawler_etl_prod")
														
 
															-    platform = "weixin_search"
														
 
															-    publish_time_stamp = int(video_obj['pubTime'])
														
 
															-    item = VideoItem()
														
 
															-    item.add_video_info("user_id", user["uid"])
														
 
															-    item.add_video_info("user_name", user["nick_name"])
														
 
															-    item.add_video_info("video_id", video_obj['hashDocID'])
														
 
															-    item.add_video_info("video_title", trace_id)
														
 
															-    item.add_video_info("publish_time_stamp", int(publish_time_stamp))
														
 
															-    item.add_video_info("video_url", video_obj["videoUrl"])
														
 
															-    item.add_video_info("cover_url", video_obj["image"])
														
 
															-    item.add_video_info("out_video_id", video_obj['hashDocID'])
														
 
															-    item.add_video_info("out_user_id", trace_id)
														
 
															-    item.add_video_info("platform", platform)
														
 
															-    item.add_video_info("strategy", "search")
														
 
															-    item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
														
 
															-    mq_obj = item.produce_item()
														
 
															-    ETL_MQ.send_msg(params=mq_obj)
														
 
															-    logging(
														
 
															-        code="6002",
														
 
															-        info="发送消息至 ETL",
														
 
															-        data=mq_obj
														
 
															-    )
														
 
															+from applications.functions.log import logging
														
 
															+from applications.functions.video_item import video_mq_sender
														
 
															-def return_video(video_path, title, trace_id):
														
 
															+def recall_search_video(video_path, title, trace_id):
														
 
															     """
														
 
															     search and send msg to ETL
														
 
															     :param trace_id:
														
@@ -76,81 +21,45 @@ def return_video(video_path, title, trace_id):
 
															     with open(video_path, encoding='utf-8') as f:
														
 
															         my_obj = json.loads(f.read())
														
 
															     if my_obj:
														
 
															-        # 三者都搜索，优先搜索 title
														
 
															-        title_result = wx_search(keys=title)
														
 
															-        if title_result['msg'] == '未知错误':
														
 
															-            logging(
														
 
															-                code="7001",
														
 
															-                info="通过标题搜索失败---{}".format(title),
														
 
															-                trace_id=trace_id
														
 
															-            )
														
 
															-        else:
														
 
															-            obj_list = title_result['data']['data']
														
 
															-            if obj_list:
														
 
															-                return obj_list[0]
														
 
															-            # for obj in obj_list:
														
 
															-            #     try:
														
 
															-            #         title = obj['items'][0]['title'].replace('<em class=\"highlight\">', '').replace('</em>',
														
 
															-            #                                                                                      '').replace("#",
														
 
															-            #                                                                                             "")
														
 
															-            #         if Functions().sensitive_flag(title):
														
 
															-            #             return obj
														
 
															-            #         else:
														
 
															-            #             continue
														
 
															-            #     except Exception as e:
														
 
															-            #         print(e)
														
 
															-            #         continue
														
 
															-
														
 
															-        # # search_keys
														
 
															-        search_keys_result = wx_search(keys=my_obj['search_keys'][0])
														
 
															-        if search_keys_result['msg'] == '未知错误':
														
 
															-            logging(
														
 
															-                code="7001",
														
 
															-                info="通过搜索词搜索失败---{}".format(title),
														
 
															-                trace_id=trace_id
														
 
															-            )
														
 
															+        wx_result = wx_search(keys=title)
														
 
															+        if wx_result:
														
 
															+            return {
														
 
															+                "platform": "wx_search",
														
 
															+                "result": wx_result[0]
														
 
															+            }
														
 
															         else:
														
 
															-            obj_list = search_keys_result['data']['data']
														
 
															-            if obj_list:
														
 
															-                return obj_list[0]
														
 
															-        #     for obj in obj_list:
														
 
															-        #         try:
														
 
															-        #             title = obj['items'][0]['title'].replace('<em class=\"highlight\">', '').replace('</em>',
														
 
															-        #                                                                                              '').replace("#",
														
 
															-        #                                                                                                          "")
														
 
															-        #             if Functions().sensitive_flag(title):
														
 
															-        #                 return obj
														
 
															-        #             else:
														
 
															-        #                 continue
														
 
															-        #         except Exception as e:
														
 
															-        #             print(e)
														
 
															-        #             continue
														
 
															-
														
 
															-        # theme
														
 
															-        theme_result = wx_search(keys=my_obj['theme'])
														
 
															-        if theme_result['msg'] == '未知错误':
														
 
															             logging(
														
 
															                 code="7001",
														
 
															-                info="通过主题搜索失败---{}".format(title),
														
 
															+                info="通过微信搜索失败---{}".format(title),
														
 
															                 trace_id=trace_id
														
 
															             )
														
 
															-        else:
														
 
															-            obj_list = theme_result['data']['data']
														
 
															-            if obj_list:
														
 
															-                return obj_list[0]
														
 
															-            # for obj in obj_list:
														
 
															-            #     try:
														
 
															-            #         title = obj['items'][0]['title'].replace('<em class=\"highlight\">', '').replace('</em>',
														
 
															-            #                                                                                          '').replace("#",
														
 
															-            #                                                                                                      "")
														
 
															-            #         if Functions().sensitive_flag(title):
														
 
															-            #             return obj
														
 
															-            #         else:
														
 
															-            #             continue
														
 
															-            #     except Exception as e:
														
 
															-            #         print(e)
														
 
															-            #         continue
														
 
															-        return None
														
 
															+            # 微信搜不到的话，采用好看视频搜索
														
 
															+            baidu_result = hksp_search(key=title)
														
 
															+            if baidu_result:
														
 
															+                return {
														
 
															+                    "platform": "baidu_search",
														
 
															+                    "result": baidu_result[0]
														
 
															+                }
														
 
															+            else:
														
 
															+                # 若好看视频未搜到，则采用西瓜搜索
														
 
															+                logging(
														
 
															+                    code="7001",
														
 
															+                    info="通过baidu搜索失败---{}".format(title),
														
 
															+                    trace_id=trace_id
														
 
															+                )
														
 
															+                xigua_result = xigua_search(title)
														
 
															+                if xigua_result:
														
 
															+                    return {
														
 
															+                        "platform": "xg_search",
														
 
															+                        "result": xigua_result[0]
														
 
															+                    }
														
 
															+                else:
														
 
															+                    logging(
														
 
															+                        code="7001",
														
 
															+                        info="通过西瓜搜索失败---{}".format(title),
														
 
															+                        trace_id=trace_id
														
 
															+                    )
														
 
															+                    return None
														
 
															     else:
														
 
															         logging(
														
 
															             code="7000",
														
@@ -169,18 +78,21 @@ def search_videos(video_path, title, trace_id, gh_id):
 
															     :param trace_id:
														
 
															     :return:
														
 
															     """
														
 
															-    video_obj = return_video(video_path, title, trace_id)
														
 
															-    if video_obj:
														
 
															+    recall_obj = recall_search_video(video_path, title, trace_id)
														
 
															+    platform = recall_obj["platform"]
														
 
															+    recall_video = recall_obj["result"]
														
 
															+    if recall_video:
														
 
															         logging(
														
 
															             code="7002",
														
 
															-            info="视频搜索成功",
														
 
															+            info="视频搜索成功, 搜索平台为--{}".format(platform),
														
 
															             trace_id=trace_id,
														
 
															-            data=video_obj
														
 
															+            data=recall_video
														
 
															         )
														
 
															-        process_weixin_video_obj(
														
 
															-            video_obj=video_obj['items'][0],
														
 
															+        video_mq_sender(
														
 
															+            video_obj=recall_video,
														
 
															             user=gh_id_dict.get(gh_id),
														
 
															-            trace_id=trace_id
														
 
															+            trace_id=trace_id,
														
 
															+            platform=platform
														
 
															         )
														
 
															     else:
														
 
															         logging(
														
--- a/applications/search/hksp_search.py
+++ b/applications/search/hksp_search.py
@@ -7,6 +7,8 @@ import urllib.parse
 
															 import time
														
 
															 import hashlib
														
 
															+from applications.functions.common import MySQLServer
														
 
															+
														
 
															 def get_video_detail(video_id):
														
 
															     """
														
@@ -35,7 +37,6 @@ def get_video_detail(video_id):
 
															         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
														
 
															     }
														
 
															     response = requests.request("GET", url, headers=headers, params=params).json()
														
 
															-    # print(json.dumps(response['data']['apiData']['curVideoMeta'], ensure_ascii=False, indent=4))
														
 
															     return response['data']['apiData']['curVideoMeta']
														
@@ -43,6 +44,19 @@ def hksp_search(key):
 
															     """
														
 
															     好看视频搜索爬虫
														
 
															     """
														
 
															+    sensitive_words = MySQLServer().select_sensitive_words()
														
 
															+
														
 
															+    def sensitive_flag(s_words, ori_title):
														
 
															+        """
														
 
															+        :param ori_title:
														
 
															+        :param s_words:
														
 
															+        :return:
														
 
															+        """
														
 
															+        for word in s_words:
														
 
															+            if word in ori_title:
														
 
															+                return False
														
 
															+        return True
														
 
															+
														
 
															     timestamp_seconds = time.time()
														
 
															     timestamp_milliseconds = int(timestamp_seconds * 1000)
														
 
															     url = 'https://haokan.baidu.com/haokan/ui-search/pc/search/video'
														
@@ -63,32 +77,23 @@ def hksp_search(key):
 
															         'authority': 'haokan.baidu.com',
														
 
															         'accept': '*/*',
														
 
															         'accept-language': 'zh,en;q=0.9,zh-CN;q=0.8',
														
 
															-        'cookie': "BIDUPSID='",
														
 
															-        # 'referer': 'https://haokan.baidu.com/web/search/page?query=%E8%80%81%E4%BA%BA',
														
 
															-        'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
														
 
															-        'sec-ch-ua-mobile': '?0',
														
 
															-        'sec-ch-ua-platform': '"macOS"',
														
 
															-        'sec-fetch-dest': 'empty',
														
 
															-        'sec-fetch-mode': 'cors',
														
 
															-        'sec-fetch-site': 'same-origin',
														
 
															+        'cookie': "BIDUPSID=",
														
 
															         'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
														
 
															         'x-requested-with': 'xmlhttprequest',
														
 
															     }
														
 
															     # 发送GET请求
														
 
															     response = requests.get(url, headers=headers, params=params).json()
														
 
															-    # print(json.dumps(response, ensure_ascii=False, indent=4))
														
 
															     data_list = response['data']['list']
														
 
															+
														
 
															     L = []
														
 
															-    for data in data_list[:5]:
														
 
															+    for data in data_list:
														
 
															         try:
														
 
															             video_id = data['vid']
														
 
															             res = get_video_detail(video_id)
														
 
															-            temp = ["haokanshipin", res['title'], res['playurl'], "https://haokan.baidu.com/v?vid={}".format(video_id)]
														
 
															-            L.append(temp)
														
 
															+            if sensitive_flag(sensitive_words, ['title']) and int(res['duration']) <= 300:
														
 
															+                L.append(res)
														
 
															+            else:
														
 
															+                continue
														
 
															         except:
														
 
															             pass
														
 
															     return L
														
 
															-
														
 
															-
														
 
															-if __name__ == '__main__':
														
 
															-    hksp_search("美国竟对中国提出4个荒唐的条件，真是好大的口气")
														
--- a/applications/search/weixin_search.py
+++ b/applications/search/weixin_search.py
@@ -4,6 +4,8 @@
 
															 import json
														
 
															 import requests
														
 
															+from applications.functions.common import MySQLServer
														
 
															+
														
 
															 def wx_search(keys):
														
 
															     """
														
@@ -11,6 +13,20 @@ def wx_search(keys):
 
															     :param keys:
														
 
															     :return:
														
 
															     """
														
 
															+
														
 
															+    sensitive_words = MySQLServer().select_sensitive_words()
														
 
															+
														
 
															+    def sensitive_flag(s_words, ori_title):
														
 
															+        """
														
 
															+        :param ori_title:
														
 
															+        :param s_words:
														
 
															+        :return:
														
 
															+        """
														
 
															+        for word in s_words:
														
 
															+            if word in ori_title:
														
 
															+                return False
														
 
															+        return True
														
 
															+
														
 
															     url = "http://8.217.190.241:8888/crawler/wei_xin/keyword"
														
 
															     payload = json.dumps({
														
 
															         "keyword": keys,
														
@@ -20,5 +36,23 @@ def wx_search(keys):
 
															     headers = {
														
 
															         'Content-Type': 'application/json'
														
 
															     }
														
 
															-    response = requests.request("POST", url, headers=headers, data=payload)
														
 
															-    return response.json()
														
 
															+    response = requests.request("POST", url, headers=headers, data=payload).json()
														
 
															+    if response['msg'] == '未知错误':
														
 
															+        return []
														
 
															+    else:
														
 
															+        L = []
														
 
															+        if response['data']:
														
 
															+            video_list = response['data']['data']
														
 
															+            for video in video_list:
														
 
															+                try:
														
 
															+                    video_info = video['items'][0]
														
 
															+                    title = video_info['title']
														
 
															+                    duration_str = video_info['duration']
														
 
															+                    dr = int(duration_str.split(":")[0].strip()) + int(duration_str.split(":")[1].strip())
														
 
															+                    if sensitive_flag(sensitive_words, title) and dr <= 300:
														
 
															+                        L.append(video_info)
														
 
															+                    else:
														
 
															+                        continue
														
 
															+                except:
														
 
															+                    pass
														
 
															+        return L
														
--- a/applications/search/xigua_search.py
+++ b/applications/search/xigua_search.py
@@ -4,183 +4,188 @@
 
															 """
														
 
															 import re
														
 
															 import json
														
 
															-import time
														
 
															-import random
														
 
															 import base64
														
 
															+import requests
														
 
															 import urllib.parse
														
 
															-import requests
														
 
															 from lxml import etree
														
 
															 from Crypto.Cipher import AES
														
 
															 from Crypto.Util.Padding import unpad
														
 
															 from fake_useragent import FakeUserAgent
														
 
															-
														
 
															-def byte_dance_cookie(item_id):
														
 
															-    """
														
 
															-    获取西瓜视频的 cookie
														
 
															-    :param item_id:
														
 
															-    """
														
 
															-    sess = requests.Session()
														
 
															-    sess.headers.update({
														
 
															-        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
														
 
															-        'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
														
 
															-    })
														
 
															-
														
 
															-    # 获取 cookies
														
 
															-    sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
														
 
															-    data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
														
 
															-    r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
														
 
															-    # print(r.text)
														
 
															-    return r.cookies.values()[0]
														
 
															+from applications.functions.common import MySQLServer
														
 
															-def aes_decrypt(data: str, key: str) -> str:
														
 
															+class XiGuaFunctions(object):
														
 
															     """
														
 
															-    XiGua AES decrypt
														
 
															-    :param data:
														
 
															-    :param key:
														
 
															-    :return:
														
 
															+    XiGuaSearch Class
														
 
															     """
														
 
															-    password = key.encode()
														
 
															-    iv = password[:16]
														
 
															-    try:
														
 
															-        ct = base64.b64decode(data.encode())
														
 
															-        cipher = AES.new(password, AES.MODE_CBC, iv)
														
 
															-        pt = unpad(cipher.decrypt(ct), AES.block_size)
														
 
															-        return base64.b64decode(pt).decode()
														
 
															-    except Exception as e:
														
 
															-        print("Incorrect decryption {}".format(e))
														
 
															-        return None
														
 
															-
														
 
															-def extract_video_url(text):
														
 
															-    """
														
 
															-    获取视频 video_url
														
 
															-    :param text:
														
 
															-    :return:
														
 
															-    """
														
 
															-    HTML = etree.HTML(text)
														
 
															-    str_2 = HTML.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
														
 
															-    json_2 = str_2[str_2.find('{'):str_2.rfind('}') + 1]
														
 
															-    Irregulars = ['null', 'undefined', '=false', '=true', 'false', 'true']
														
 
															-    # python中不规则的定义
														
 
															-    for I in Irregulars:
														
 
															-        if I in ['=false', '=true']:
														
 
															-            json_2 = json_2.replace(I, '=' + I[1:].capitalize())
														
 
															+    @classmethod
														
 
															+    def tunnel_proxies(cls):
														
 
															+        """
														
 
															+            快代理方法
														
 
															+            :return:
														
 
															+            """
														
 
															+        tunnel = "q796.kdltps.com:15818"
														
 
															+        username = "t17772369458618"
														
 
															+        password = "5zqcjkmy"
														
 
															+        proxies = {
														
 
															+            "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
														
 
															+            "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
														
 
															+        }
														
 
															+        return proxies
														
 
															+
														
 
															+    @classmethod
														
 
															+    def byte_dance_cookie(cls, item_id):
														
 
															+        """
														
 
															+        获取西瓜视频的 cookie
														
 
															+        :param item_id:
														
 
															+        """
														
 
															+        sess = requests.Session()
														
 
															+        sess.headers.update({
														
 
															+            'user-agent': FakeUserAgent().chrome,
														
 
															+            'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
														
 
															+        })
														
 
															+
														
 
															+        # 获取 cookies
														
 
															+        sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
														
 
															+        data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
														
 
															+        r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
														
 
															+        if r.json()['redirect_url']:
														
 
															+            requests.get(
														
 
															+                url=r.json()['redirect_url']
														
 
															+            )
														
 
															+        return r.cookies.values()[0]
														
 
															+
														
 
															+    @classmethod
														
 
															+    def aes_decrypt(cls, data, key):
														
 
															+        """
														
 
															+        XiGua AES decrypt
														
 
															+        :param data:
														
 
															+        :param key:
														
 
															+        :return:
														
 
															+        """
														
 
															+        password = key.encode()
														
 
															+        iv = password[:16]
														
 
															+        try:
														
 
															+            ct = base64.b64decode(data.encode())
														
 
															+            cipher = AES.new(password, AES.MODE_CBC, iv)
														
 
															+            pt = unpad(cipher.decrypt(ct), AES.block_size)
														
 
															+            return base64.b64decode(pt).decode()
														
 
															+        except Exception as e:
														
 
															+            print("Incorrect decryption {}".format(e))
														
 
															+            return None
														
 
															+
														
 
															+    @classmethod
														
 
															+    def extract_video_url(cls, text):
														
 
															+        """
														
 
															+        获取视频 video_url
														
 
															+        :param text:
														
 
															+        :return:
														
 
															+        """
														
 
															+        HTML = etree.HTML(text)
														
 
															+        str_2 = HTML.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
														
 
															+        json_2 = str_2[str_2.find('{'):str_2.rfind('}') + 1]
														
 
															+        Irregulars = ['null', 'undefined', '=false', '=true', 'false', 'true']
														
 
															+        # python中不规则的定义
														
 
															+        for I in Irregulars:
														
 
															+            if I in ['=false', '=true']:
														
 
															+                json_2 = json_2.replace(I, '=' + I[1:].capitalize())
														
 
															+            else:
														
 
															+                json_2 = json_2.replace(I, '12')
														
 
															+        dict_2 = json.loads(json_2)["anyVideo"]["gidInformation"]["packerData"]["video"]
														
 
															+        duration = dict_2["video_duration"]
														
 
															+        play_cnt = dict_2['video_watch_count']
														
 
															+        publish_time = int(dict_2['video_publish_time'])
														
 
															+        like_cnt = dict_2['video_like_count']
														
 
															+        video_title = dict_2['title']
														
 
															+        video_id = dict_2['vid']
														
 
															+        video_res = dict_2['videoResource']
														
 
															+        cover_url = dict_2['poster_url'].replace("\\u002F", "/")
														
 
															+        if video_res['dash'] == 12:
														
 
															+            obj = video_res['normal']
														
 
															+            ptk = obj['ptk']
														
 
															+            video_list = obj['video_list']
														
 
															+            keys = list(video_list.keys())
														
 
															+            main_url = video_list[keys[-1]]['main_url']
														
 
															+            real_video_url = cls.aes_decrypt(data=main_url, key=ptk)
														
 
															         else:
														
 
															-            json_2 = json_2.replace(I, '12')
														
 
															-    dict_2 = json.loads(json_2)["anyVideo"]["gidInformation"]["packerData"]["video"]["videoResource"]
														
 
															-    if dict_2['dash'] == 12:
														
 
															-        obj = dict_2['normal']
														
 
															-        ptk = obj['ptk']
														
 
															-        main_url = obj['video_list']['video_3']['main_url']
														
 
															-        real_video_url = aes_decrypt(data=main_url, key=ptk)
														
 
															-    else:
														
 
															-        obj = dict_2['dash']
														
 
															-        ptk = obj["ptk"]
														
 
															-        video_url = obj['dynamic_video']['main_url']
														
 
															-        real_video_url = aes_decrypt(data=video_url, key=ptk)
														
 
															-    return real_video_url
														
 
															-
														
 
															-
														
 
															-def extract_info_by_re(text):
														
 
															-    """
														
 
															-    通过正则表达式获取文本中的信息
														
 
															-    :param text:
														
 
															-    :return:
														
 
															-    """
														
 
															-    # 标题
														
 
															-    title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
														
 
															-    if title_match:
														
 
															-        title_content = title_match.group(1)
														
 
															-        title_content = title_content.split(" - ")[0]
														
 
															-        title_content = bytes(title_content, "latin1").decode()
														
 
															-    else:
														
 
															-        title_content = ""
														
 
															-
														
 
															-    # video_id
														
 
															-    video_id = re.search(r'"vid":"(.*?)"', text).group(1)
														
 
															-
														
 
															-    # like_count
														
 
															-    like_count = re.search(r'"video_like_count":(.*?),', text).group(1)
														
 
															-
														
 
															-    # cover_url
														
 
															-    cover_url = re.search(r'"avatar_url":"(.*?)"', text).group(1)
														
 
															-
														
 
															-    # video_play
														
 
															-    video_watch_count = re.search(r'"video_watch_count":(.*?),', text).group(1)
														
 
															-
														
 
															-    # "video_publish_time"
														
 
															-    publish_time = re.search(r'"video_publish_time":"(.*?)"', text).group(1)
														
 
															-
														
 
															-    # video_duration
														
 
															-    duration = re.search(r'("video_duration":)(.*?)"', text).group(2).replace(",", "")
														
 
															-
														
 
															-    return {
														
 
															-        "title": title_content,
														
 
															-        "url": extract_video_url(text),
														
 
															-        "video_id": video_id,
														
 
															-        "like_count": like_count,
														
 
															-        "cover_url": cover_url,
														
 
															-        "play_count": video_watch_count,
														
 
															-        "publish_time": publish_time,
														
 
															-        "duration": duration
														
 
															-    }
														
 
															-
														
 
															-
														
 
															-def get_video_info(item_id):
														
 
															-    """
														
 
															-    获取视频信息
														
 
															-    """
														
 
															-    url = "https://www.ixigua.com/{}".format(item_id)
														
 
															-    headers = {
														
 
															-        "accept-encoding": "gzip, deflate",
														
 
															-        "accept-language": "zh-CN,zh-Hans;q=0.9",
														
 
															-        "cookie": "ttwid={}".format(byte_dance_cookie(item_id)),
														
 
															-        "user-agent": FakeUserAgent().random,
														
 
															-        "referer": "https://www.ixigua.com/{}/".format(item_id),
														
 
															-    }
														
 
															-    response = requests.get(
														
 
															-        url=url,
														
 
															-        headers=headers,
														
 
															-        # proxies=tunnel_proxies(),
														
 
															-        timeout=5,
														
 
															-    )
														
 
															-    time.sleep(random.randint(1, 5))
														
 
															-    video_info = extract_info_by_re(response.text)
														
 
															-
														
 
															-    video_dict = {
														
 
															-        "video_title": video_info.get("title", ""),
														
 
															-        "video_id": video_info.get("video_id"),
														
 
															-        "gid": str(item_id),
														
 
															-        "play_cnt": int(video_info.get("play_count", 0)),
														
 
															-        "like_cnt": int(video_info.get("like_count", 0)),
														
 
															-        "comment_cnt": 0,
														
 
															-        "share_cnt": 0,
														
 
															-        "favorite_cnt": 0,
														
 
															-        "duration": int(video_info.get("duration", 0)),
														
 
															-        "video_width": 0,
														
 
															-        "video_height": 0,
														
 
															-        "publish_time_stamp": int(video_info.get("publish_time", 0)),
														
 
															-        "publish_time_str": time.strftime(
														
 
															-            "%Y-%m-%d %H:%M:%S",
														
 
															-            time.localtime(int(video_info.get("publish_time", 0))),
														
 
															-        ),
														
 
															-        "avatar_url": str(
														
 
															-            video_info.get("user_info", {}).get("avatar_url", "")
														
 
															-        ),
														
 
															-        "cover_url": video_info.get("cover_url", "").replace("\\u002F", "/"),
														
 
															-        "video_url": video_info.get("url"),
														
 
															-        "session": f"xigua-author-{int(time.time())}",
														
 
															-    }
														
 
															-    return video_dict
														
 
															+            obj = video_res['dash']
														
 
															+            ptk = obj["ptk"]
														
 
															+            video_url = obj['dynamic_video']['main_url']
														
 
															+            real_video_url = cls.aes_decrypt(data=video_url, key=ptk)
														
 
															+        return {
														
 
															+            "video_url": real_video_url,
														
 
															+            "cover_url": cover_url,
														
 
															+            "video_id": video_id,
														
 
															+            "video_title": video_title,
														
 
															+            "like_cnt": like_cnt,
														
 
															+            "play_cnt": play_cnt,
														
 
															+            "publish_time": publish_time,
														
 
															+            "duration": duration
														
 
															+        }
														
 
															+
														
 
															+    @classmethod
														
 
															+    def extract_info_by_re(cls, text):
														
 
															+        """
														
 
															+        通过正则表达式获取文本中的信息
														
 
															+        :param text:
														
 
															+        :return:
														
 
															+        """
														
 
															+        result = cls.extract_video_url(text)
														
 
															+        # 标题
														
 
															+        title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
														
 
															+        if title_match:
														
 
															+            title_content = title_match.group(1)
														
 
															+            title_content = title_content.split(" - ")[0]
														
 
															+            try:
														
 
															+                title_content = bytes(title_content, "latin1").decode()
														
 
															+            except:
														
 
															+                title_content = title_content
														
 
															+        else:
														
 
															+            title_content = ""
														
 
															+        result['video_title'] = title_content
														
 
															+        return result
														
 
															+
														
 
															+    @classmethod
														
 
															+    def get_video_info(cls, item_id):
														
 
															+        """
														
 
															+        获取视频信息
														
 
															+        """
														
 
															+        url = "https://www.ixigua.com/{}".format(item_id)
														
 
															+        headers = {
														
 
															+            "accept-encoding": "gzip, deflate",
														
 
															+            "accept-language": "zh-CN,zh-Hans;q=0.9",
														
 
															+            "cookie": "ttwid={}".format(cls.byte_dance_cookie(item_id)),
														
 
															+            "user-agent": FakeUserAgent().random,
														
 
															+            "referer": "https://www.ixigua.com/{}/".format(item_id),
														
 
															+        }
														
 
															+        response = requests.get(
														
 
															+            url=url,
														
 
															+            headers=headers
														
 
															+        )
														
 
															+        video_info = cls.extract_info_by_re(response.text)
														
 
															+        return video_info
														
 
															 def xigua_search(keyword):
														
 
															     """
														
 
															     搜索
														
 
															     """
														
 
															+    sensitive_words = MySQLServer().select_sensitive_words()
														
 
															+
														
 
															+    def sensitive_flag(s_words, ori_title):
														
 
															+        """
														
 
															+        :param ori_title:
														
 
															+        :param s_words:
														
 
															+        :return:
														
 
															+        """
														
 
															+        for word in s_words:
														
 
															+            if word in ori_title:
														
 
															+                return False
														
 
															+        return True
														
 
															+
														
 
															     keyword = urllib.parse.quote(keyword)
														
 
															     base_url = "https://www.ixigua.com/search/{}/ab_name=search&fss=input".format(
														
 
															         keyword
														
@@ -191,13 +196,6 @@ def xigua_search(keyword):
 
															         "accept-language": "zh,en;q=0.9,zh-CN;q=0.8",
														
 
															         "cache-control": "max-age=0",
														
 
															         "cookie": "ixigua-a-s=1; support_webp=true; support_avif=true; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=Ur23fgYD2pMJOvi1BpILyfaobg8wA7IhGwmQx260ULRa8Dvjaxc5ZA63BUIP-6Vi473f; ttwid=1%7CNtTtSp4Iej-v0nWtepdZH3d3Ts6uGNMFzTN20ps1cdo%7C1708236945%7Cc1f301c64aa3bf69cdaa41f28856e2bb7b7eed16583f8c92d50cffa2d9944fc6; msToken=rr418opQf04vm8n9s8FAGdr1AoCUsvAOGKSDPbBEfwVS1sznxxZCvcZTI93qVz5uAXlX9yRwcKlNQZ4wMro2DmlHw5yWHAVeKr_SzgO1KtVVnjUMTUNEux_cq1-EIkI=",
														
 
															-        "sec-ch-ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
														
 
															-        "sec-ch-ua-mobile": "?0",
														
 
															-        "sec-ch-ua-platform": '"macOS"',
														
 
															-        "sec-fetch-dest": "document",
														
 
															-        "sec-fetch-mode": "navigate",
														
 
															-        "sec-fetch-site": "none",
														
 
															-        "sec-fetch-user": "?1",
														
 
															         "upgrade-insecure-requests": "1",
														
 
															         "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
														
 
															     }
														
@@ -206,14 +204,20 @@ def xigua_search(keyword):
 
															     result = html.xpath(
														
 
															         '//a[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]/@href'
														
 
															     )
														
 
															-    res_list = []
														
 
															-    for page_id in result[:5]:
														
 
															-        doc_id = page_id[1:].split("?")[0]
														
 
															-        try:
														
 
															-            res = get_video_info(doc_id)
														
 
															-            temp = ["xigua", res['video_title'], res['video_url'], "https://www.ixigua.com/{}".format(doc_id)]
														
 
															-            res_list.append(temp)
														
 
															-        except:
														
 
															-            pass
														
 
															-    return res_list
														
 
															-
														
 
															+    if result:
														
 
															+        L = []
														
 
															+        doc_id_list = [page_id[1:] for page_id in result]
														
 
															+        for doc_id in doc_id_list:
														
 
															+            try:
														
 
															+                video_d = XiGuaFunctions().get_video_info(doc_id)
														
 
															+                video_title = video_d['video_title']
														
 
															+                if sensitive_flag(sensitive_words, video_title) and int(video_d['duration']) <= 300:
														
 
															+                    L.append(video_d)
														
 
															+                else:
														
 
															+                    continue
														
 
															+            except Exception as e:
														
 
															+                print(e)
														
 
															+                continue
														
 
															+        return L
														
 
															+    else:
														
 
															+        return []
														
--- a/applications/static/config.py
+++ b/applications/static/config.py
@@ -299,4 +299,29 @@ gh_id_dict = {
 
															         "uid": 69637480,
														
 
															         "nick_name": "风间"
														
 
															     }
														
 
															-}
														
 
															+}
														
 
															+
														
 
															+sensitive_words = [
														
 
															+    "台湾",
														
 
															+    "南海",
														
 
															+    "强奸",
														
 
															+    "寂寞难耐",
														
 
															+    "欲求不满",
														
 
															+    "不雅视频",
														
 
															+    "人妻",
														
 
															+    "侵犯",
														
 
															+    "正部级",
														
 
															+    "外长",
														
 
															+    "邓小平",
														
 
															+    "林彪",
														
 
															+    "李先念",
														
 
															+    "毛主席",
														
 
															+    "毛泽东",
														
 
															+    "江青",
														
 
															+    "朱镕基",
														
 
															+    "胡耀邦",
														
 
															+    "政治局",
														
 
															+    "省委书记",
														
 
															+    "国防部长",
														
 
															+    "外交部长"
														
 
															+]
														
--- a/dev/test.py
+++ b/dev/test.py
@@ -10,7 +10,7 @@ body = {
 
															     "content": "",
														
 
															     "cover": "",
														
 
															     "ghId": "gh_d2cc901deca7",
														
 
															-    "title": "江泽民"
														
 
															+    "title": "掘金大战森林狼"
														
 
															 }
														
 
															 a = time.time()
														
 
															 header = {
														
--- a/dev/test_search.py
+++ b/dev/test_search.py
@@ -0,0 +1,15 @@
 
															+"""
														
 
															+@author: luojunhui
														
 
															+"""
														
 
															+from applications.search import *
														
 
															+
														
 
															+keys = "湖人大战勇士"
														
 
															+
														
 
															+wx_result = wx_search(keys)
														
 
															+print(wx_result)
														
 
															+
														
 
															+xg_result = xigua_search(keys)
														
 
															+print(xg_result)
														
 
															+
														
 
															+baidu_result = hksp_search(keys)
														
 
															+print(baidu_result)