浏览代码

2024-05-13
多平台搜索上线(微信, 百度, 西瓜)
弃用 search_keys, theme
搜索结果过pq敏感词策略

罗俊辉 1 年之前
父节点
当前提交
eebeaef171

+ 36 - 31
applications/functions/common.py

@@ -3,6 +3,7 @@
 """
 import os
 import json
+import time
 import uuid
 import requests
 import pymysql
@@ -17,21 +18,6 @@ class Functions(object):
     通用工具代码
     """
 
-    # 敏感词逻辑
-    @classmethod
-    def sensitive_flag(cls, title):
-        """
-        判断标题是否命中过滤词
-        :param title:
-        :return:
-        """
-        sensitive_words = MySQLServer().select_sensitive_words()
-        for word in sensitive_words:
-            if word in title:
-                # title = title.replace(word, "*")
-                return False
-        return True
-
     # 自动加入白名单逻辑
     @classmethod
     def auto_white(cls, root_share_id):
@@ -155,7 +141,8 @@ class MySQLServer(object):
         :param trace_id:
         :return:
         """
-        sql = "select video_id from crawler_video where out_user_id = '{}' and video_title = '{}';".format(trace_id, trace_id)
+        sql = "select video_id from crawler_video where out_user_id = '{}' and video_title = '{}';".format(trace_id,
+                                                                                                           trace_id)
         connection = pymysql.connect(
             host="rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com",  # 数据库IP地址,内网地址
             port=3306,  # 端口号
@@ -168,20 +155,12 @@ class MySQLServer(object):
         cursor.execute(sql)
         out_video_list = cursor.fetchall()
         if len(out_video_list) > 0:
-            vid_list = [i[0] for i in out_video_list if i[0] != 0]
-            vid_list = [vid_list[0]]
-            # dir_path = os.path.join(os.getcwd(), 'applications', 'static', "out_videos")
-            # os.makedirs(os.path.dirname(dir_path), exist_ok=True)
-            # done_list = os.listdir(dir_path)
-            # process_list = [
-            #     (
-            #         i[1],
-            #         trace_id,
-            #         os.path.join(dir_path, "{}.json".format(i[0]))
-            #     ) for i in out_video_list if not "{}.json".format(i[0]) in done_list
-            # ]
-            # if process_list:
-            #     ask_kimi_and_save_to_local(process_list[0])
+            if out_video_list[0][0] == 0:
+                video_id = cls.search_id_to_video(trace_id)
+            else:
+                video_id = out_video_list[0][0]
+
+            vid_list = [video_id]
             logging(
                 code="2003",
                 trace_id=trace_id,
@@ -252,6 +231,32 @@ class MySQLServer(object):
         result = [line[0] for line in data]
         return result
 
+    @classmethod
+    def search_id_to_video(cls, trace_id, max_retries=None, interval=1):
+        """
+        Poll crawler_video until the row for this search has a non-zero video_id.
+
+        The row is selected where both out_user_id and video_title equal
+        trace_id. A missing row or a video_id of 0 is treated as "not ready
+        yet": the method sleeps for ``interval`` seconds and queries again.
+
+        :param trace_id: search trace id stored in out_user_id / video_title
+        :param max_retries: maximum number of polls; None (the default) keeps
+                            polling until a non-zero video_id is found
+        :param interval: seconds to sleep between two polls
+        :return: video_id of the first matching row once it is non-zero
+        :raises TimeoutError: if max_retries polls finished without a video_id
+        """
+        # Placeholders let the driver escape trace_id instead of splicing it into the SQL text.
+        sql = "select video_id from crawler_video where out_user_id = %s and video_title = %s;"
+        attempts = 0
+        # A loop replaces the former self-recursion, which grew the call stack
+        # by one frame (and one open connection) for every second of waiting.
+        while True:
+            # A fresh connection per poll, as before; presumably this also keeps
+            # each read from reusing an older transaction snapshot — TODO confirm.
+            connection = pymysql.connect(
+                host="rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com",  # database host (intranet address)
+                port=3306,  # port
+                user="crawler",  # mysql user name
+                passwd="crawler123456@",  # mysql password
+                db="piaoquan-crawler",  # database name
+                charset="utf8mb4"  # connection charset
+            )
+            try:
+                with connection.cursor() as cursor:
+                    cursor.execute(sql, (trace_id, trace_id))
+                    out_video_list = cursor.fetchall()
+            finally:
+                # Always release the connection, even when the query raises.
+                connection.close()
+            if out_video_list and int(out_video_list[0][0]) != 0:
+                return out_video_list[0][0]
+            attempts += 1
+            if max_retries is not None and attempts >= max_retries:
+                raise TimeoutError(
+                    "no video_id for {} after {} polls".format(trace_id, attempts)
+                )
+            time.sleep(interval)
+
 
 class KimiServer(object):
     """
@@ -364,4 +369,4 @@ class KimiServer(object):
             model="moonshot-v1-8k",
         )
         response = chat_completion.choices[0].message.content
-        return response
+        return response

+ 0 - 98
applications/functions/item.py

@@ -1,98 +0,0 @@
-"""
-@author: luojunhui
-"""
-import time
-
-from applications.functions.common import Functions
-
-
-class VideoItem(object):
-    """
-    function: 当扫描进一条视频的时候,对该视频的基本信息进行处理,保证发送给 pipeline和 etl 的 video_dict 是正确的
-    __init__: 初始化空json 对象,用来存储视频信息
-    add_video_info: 把视频信息存储到 item 对象中
-    check_item: 检查 item 对象中的各个元素以及处理
-    """
-
-    def __init__(self):
-        self.item = {}
-
-    def add_video_info(self, key, value):
-        self.item[key] = value
-
-    def check_item(self):
-        """
-        判断item 里面的字段,是否符合要求
-        字段分为 3 类:
-        1. 必须存在数据的字段: ["video_id", "user_id", "user_name", "out_user_id", "out_video_id", "session", "video_url", "cover_url", "platform", "strategy"]
-        2. 不存在默认为 0 的字段 :["duration", "play_cnt", "like_cnt", "comment_cnt", "share_cnt", "width", "height"]
-        3. 需要后出理的字段: video_title, publish_time
-        """
-        if self.item.get("video_title"):
-            self.item["video_title"] = Functions().clean_title(self.item["video_title"])
-        else:
-            return False
-        if self.item.get("publish_time_stamp"):
-            publish_time_str = time.strftime(
-                "%Y-%m-%d %H:%M:%S", time.localtime(self.item["publish_time_stamp"])
-            )
-            self.add_video_info("publish_time_str", publish_time_str)
-        else:
-            publish_time_stamp = int(time.time())
-            publish_time_str = time.strftime(
-                "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
-            )
-            self.add_video_info("publish_time_stamp", publish_time_stamp)
-            self.add_video_info("publish_time_str", publish_time_str)
-        self.add_video_info("publish_time", publish_time_str)
-        if not self.item.get("update_time_stamp"):
-            self.add_video_info("update_time_stamp", int(time.time()))
-
-        # 如果不存在,默认值为 0
-        config_keys = [
-            "duration",
-            "play_cnt",
-            "like_cnt",
-            "comment_cnt",
-            "share_cnt",
-            "width",
-            "height",
-        ]
-        for config_key in config_keys:
-            if self.item.get(config_key):
-                continue
-            else:
-                self.add_video_info(config_key, 0)
-
-        # 必须存在的元素,若不存在则会报错
-        must_keys = [
-            "video_id",
-            "user_id",
-            "user_name",
-            "out_video_id",
-            "session",
-            "video_url",
-            "cover_url",
-            "platform",
-            "strategy",
-        ]
-        """
-        video_id, out_video_id 均为站外视频 id
-        usr_id: 站内用户 id
-        out_user_id: 站外用户 id
-        user_name: 站外用户名称
-        """
-        for m_key in must_keys:
-            if self.item.get(m_key):
-                continue
-            else:
-                # print(m_key)
-                return False
-        return True
-
-    def produce_item(self):
-        flag = self.check_item()
-        if flag:
-            return self.item
-        else:
-            return False

+ 244 - 0
applications/functions/video_item.py

@@ -0,0 +1,244 @@
+"""
+@author: luojunhui
+"""
+import time
+from applications.functions.mq import MQ
+from applications.functions.log import logging
+from applications.functions.common import Functions
+
+
+class VideoItem(object):
+    """
+    Holds the fields of one scanned video and validates them, so that the
+    video dict handed on to the pipeline / ETL is well formed.
+
+    __init__: start with an empty dict for the video fields
+    add_video_info: store one field in the item
+    check_item: normalise and validate the stored fields
+    produce_item: return the item when it is valid, otherwise False
+    """
+
+    def __init__(self):
+        self.item = {}
+
+    def add_video_info(self, key, value):
+        """
+        Insert or overwrite one field of the item.
+
+        :param key: field name
+        :param value: field value
+        """
+        self.item[key] = value
+
+    def check_item(self):
+        """
+        Normalise the item in place and check that it is complete.
+
+        Fields fall into three groups:
+        1. required, must be truthy: video_id, user_id, user_name, out_video_id,
+           session, video_url, cover_url, platform, strategy
+        2. set to 0 when missing or falsy: duration, play_cnt, like_cnt,
+           comment_cnt, share_cnt, width, height
+        3. post-processed: video_title (cleaned) and the publish / update times
+
+        :return: True when the title and every required field are present,
+                 otherwise False
+        """
+        raw_title = self.item.get("video_title")
+        if not raw_title:
+            return False
+        self.item["video_title"] = Functions().clean_title(raw_title)
+
+        # Fall back to the current time when no publish timestamp was supplied.
+        stamp = self.item.get("publish_time_stamp")
+        if not stamp:
+            stamp = int(time.time())
+            self.add_video_info("publish_time_stamp", stamp)
+        readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stamp))
+        self.add_video_info("publish_time_str", readable)
+        self.add_video_info("publish_time", readable)
+
+        if not self.item.get("update_time_stamp"):
+            self.add_video_info("update_time_stamp", int(time.time()))
+
+        # Counters and dimensions default to 0 when they are missing or falsy.
+        zero_default_fields = (
+            "duration",
+            "play_cnt",
+            "like_cnt",
+            "comment_cnt",
+            "share_cnt",
+            "width",
+            "height",
+        )
+        for field in zero_default_fields:
+            if not self.item.get(field):
+                self.add_video_info(field, 0)
+
+        # Required fields; the item is rejected as soon as one of them is falsy.
+        # video_id / out_video_id: off-site video id
+        # user_id: in-site user id
+        # user_name: off-site user name
+        required_fields = (
+            "video_id",
+            "user_id",
+            "user_name",
+            "out_video_id",
+            "session",
+            "video_url",
+            "cover_url",
+            "platform",
+            "strategy",
+        )
+        return all(self.item.get(field) for field in required_fields)
+
+    def produce_item(self):
+        """
+        Validate the item and hand it out.
+
+        :return: the item dict when check_item succeeds, otherwise False
+        """
+        return self.item if self.check_item() else False
+
+
+class VideoProducer(object):
+    """
+    Turns a raw search result into a validated video item.
+
+    One classmethod per search platform (weixin, baidu, xigua); each maps the
+    platform specific keys of video_obj onto VideoItem fields and returns the
+    result of VideoItem.produce_item().
+    """
+
+    @classmethod
+    def wx_video_producer(cls, video_obj, user, trace_id):
+        """
+        Build the item for a WeChat search result.
+
+        :param video_obj: result dict; pubTime, hashDocID, videoUrl and image are read
+        :param user: in-site account dict; uid and nick_name are read
+        :param trace_id: search trace id, stored as video_title and out_user_id
+        :return: the item dict, or False when validation fails
+        """
+        platform = "weixin_search"
+        published_at = int(video_obj['pubTime'])
+        item = VideoItem()
+        fields = (
+            ("user_id", user["uid"]),
+            ("user_name", user["nick_name"]),
+            ("video_id", video_obj['hashDocID']),
+            ("video_title", trace_id),
+            ("publish_time_stamp", published_at),
+            ("video_url", video_obj["videoUrl"]),
+            ("cover_url", video_obj["image"]),
+            ("out_video_id", video_obj['hashDocID']),
+            ("out_user_id", trace_id),
+            ("platform", platform),
+            ("strategy", "search"),
+            ("session", "{}-{}".format(platform, int(time.time()))),
+        )
+        for name, value in fields:
+            item.add_video_info(name, value)
+        return item.produce_item()
+
+    @classmethod
+    def baidu_video_producer(cls, video_obj, user, trace_id):
+        """
+        Build the item for a Haokan (baidu) search result.
+
+        :param video_obj: result dict; publish_time, id, playurl, poster, like,
+                          playcnt and duration are read
+        :param user: in-site account dict; uid and nick_name are read
+        :param trace_id: search trace id, stored as video_title and out_user_id
+        :return: the item dict, or False when validation fails
+        """
+        platform = "baidu_search"
+        published_at = int(video_obj['publish_time'])
+        item = VideoItem()
+        fields = (
+            ("user_id", user["uid"]),
+            ("user_name", user["nick_name"]),
+            ("video_id", video_obj['id']),
+            ("video_title", trace_id),
+            ("publish_time_stamp", published_at),
+            ("video_url", video_obj["playurl"]),
+            ("cover_url", video_obj["poster"]),
+            ("out_video_id", video_obj['id']),
+            ("out_user_id", trace_id),
+            ("like_cnt", video_obj['like']),
+            ("play_cnt", video_obj['playcnt']),
+            ("duration", video_obj['duration']),
+            ("platform", platform),
+            ("strategy", "search"),
+            ("session", "{}-{}".format(platform, int(time.time()))),
+        )
+        for name, value in fields:
+            item.add_video_info(name, value)
+        return item.produce_item()
+
+    @classmethod
+    def xg_video_producer(cls, video_obj, user, trace_id):
+        """
+        Build the item for a Xigua search result.
+
+        :param video_obj: result dict; publish_time, video_id, video_url,
+                          cover_url, play_cnt, duration and like_cnt are read
+        :param user: in-site account dict; uid and nick_name are read
+        :param trace_id: search trace id, stored as video_title and out_user_id
+        :return: the item dict, or False when validation fails
+        """
+        platform = "xg_search"
+        published_at = int(video_obj['publish_time'])
+        item = VideoItem()
+        fields = (
+            ("user_id", user["uid"]),
+            ("user_name", user["nick_name"]),
+            ("video_id", video_obj['video_id']),
+            ("video_title", trace_id),
+            ("publish_time_stamp", published_at),
+            ("video_url", video_obj["video_url"]),
+            ("cover_url", video_obj["cover_url"]),
+            ("out_video_id", video_obj['video_id']),
+            ("play_cnt", video_obj['play_cnt']),
+            ("duration", video_obj['duration']),
+            ("like_cnt", video_obj['like_cnt']),
+            ("out_user_id", trace_id),
+            ("platform", platform),
+            ("strategy", "search"),
+            ("session", "{}-{}".format(platform, int(time.time()))),
+        )
+        for name, value in fields:
+            item.add_video_info(name, value)
+        return item.produce_item()
+
+
+def video_mq_sender(video_obj, user, trace_id, platform):
+    """
+    Build the payload for one search result and send it to the ETL topic.
+
+    The producer is picked by platform ("xg_search", "baidu_search" or
+    "wx_search"); any other platform yields an empty dict. The payload is
+    sent to topic_crawler_etl_prod and then logged with code 6002.
+
+    :param video_obj: raw search result for the given platform
+    :param user: in-site account dict passed on to the producer
+    :param trace_id: search trace id
+    :param platform: name of the search platform the result came from
+    :return: None
+    """
+    etl_queue = MQ(topic_name="topic_crawler_etl_prod")
+    producer = VideoProducer()
+    builders = {
+        "xg_search": producer.xg_video_producer,
+        "baidu_search": producer.baidu_video_producer,
+        "wx_search": producer.wx_video_producer,
+    }
+    build = builders.get(platform)
+    if build is None:
+        mq_obj = {}
+    else:
+        mq_obj = build(video_obj=video_obj, user=user, trace_id=trace_id)
+    # NOTE(review): mq_obj is False when the item fails validation and {} for an
+    # unknown platform; both are still sent below — confirm this is intended.
+    etl_queue.send_msg(params=mq_obj)
+    logging(
+        code="6002",
+        info="发送消息至 ETL",
+        data=mq_obj,
+        trace_id=trace_id
+    )

+ 30 - 30
applications/routes.py

@@ -35,7 +35,7 @@ async def search_videos_from_the_web():
     :return:
     """
     params = await request.get_json()
-    title = params['title']
+    title = params['title'].replace("【非头次】", "")
     gh_id = params['ghId']
     trace_id = "search-{}-{}".format(str(uuid.uuid4()), str(int(time.time())))
     params['trace_id'] = trace_id
@@ -46,36 +46,36 @@ async def search_videos_from_the_web():
         function="search_videos_from_the_web",
         trace_id=trace_id
     )
-    try:
-        title_p = os.path.join(os.getcwd(), 'applications', 'static', "titles", "{}.json".format(title))
-        if os.path.exists(title_p):
-            logging(
-                code="2001",
-                info="该标题已经被 kimi 处理过,跳过请求 kimi 操作--- {}".format(title),
-                function="search_videos_from_the_web",
-                trace_id=trace_id
-            )
-        else:
-            KimiServer().ask_kimi_and_save_to_local((title, trace_id, title_p))
-        await asyncio.sleep(1)
-        kimi_title = KimiServer().kimi_title(title)
-        search_videos(
-            title=title,
-            video_path=title_p,
-            trace_id=trace_id,
-            gh_id=gh_id,
+    # try:
+    title_p = os.path.join(os.getcwd(), 'applications', 'static', "titles", "{}.json".format(title))
+    if os.path.exists(title_p):
+        logging(
+            code="2001",
+            info="该标题已经被 kimi 处理过,跳过请求 kimi 操作--- {}".format(title),
+            function="search_videos_from_the_web",
+            trace_id=trace_id
         )
-        res = {
-            "trace_id": trace_id,
-            "code": 0,
-            "kimi_title": kimi_title
-        }
-    except Exception as e:
-        res = {
-            "trace_id": trace_id,
-            "code": 1,
-            "message": str(e)
-        }
+    else:
+        KimiServer().ask_kimi_and_save_to_local((title, trace_id, title_p))
+    await asyncio.sleep(1)
+    kimi_title = KimiServer().kimi_title(title)
+    search_videos(
+        title=title,
+        video_path=title_p,
+        trace_id=trace_id,
+        gh_id=gh_id,
+    )
+    res = {
+        "trace_id": trace_id,
+        "code": 0,
+        "kimi_title": kimi_title
+    }
+    # except Exception as e:
+    #     res = {
+    #         "trace_id": trace_id,
+    #         "code": 1,
+    #         "message": str(e)
+    #     }
     return jsonify(res)
 
 

+ 0 - 1
applications/schedule/process_schedule.py

@@ -3,7 +3,6 @@
 对请求进行操作
 """
 import json
-import time
 import os
 
 from applications.match_alg import best_choice

+ 48 - 136
applications/schedule/search_schedule.py

@@ -3,69 +3,14 @@
 调用接口在微信内搜索视频
 """
 import json
-import time
-import requests
 
-from applications.functions.mq import MQ
-from applications.functions.log import logging
+from applications.search import *
 from applications.static.config import gh_id_dict
-from applications.functions.item import VideoItem
-
-
-def wx_search(keys):
-    """
-    WeChat search
-    :param keys:
-    :return:
-    """
-    url = "http://8.217.190.241:8888/crawler/wei_xin/keyword"
-    payload = json.dumps({
-        "keyword": keys,
-        "cursor": "0",
-        "content_type": "video"
-    })
-    headers = {
-        'Content-Type': 'application/json'
-    }
-    response = requests.request("POST", url, headers=headers, data=payload)
-    return response.json()
-
-
-def process_weixin_video_obj(video_obj, user, trace_id):
-    """
-    异步处理微信 video_obj
-    公众号和站内账号一一对应
-    :param trace_id:
-    :param user:
-    :param video_obj:
-    :return:
-    """
-    ETL_MQ = MQ(topic_name="topic_crawler_etl_prod")
-    platform = "weixin_search"
-    publish_time_stamp = int(video_obj['pubTime'])
-    item = VideoItem()
-    item.add_video_info("user_id", user["uid"])
-    item.add_video_info("user_name", user["nick_name"])
-    item.add_video_info("video_id", video_obj['hashDocID'])
-    item.add_video_info("video_title", trace_id)
-    item.add_video_info("publish_time_stamp", int(publish_time_stamp))
-    item.add_video_info("video_url", video_obj["videoUrl"])
-    item.add_video_info("cover_url", video_obj["image"])
-    item.add_video_info("out_video_id", video_obj['hashDocID'])
-    item.add_video_info("out_user_id", trace_id)
-    item.add_video_info("platform", platform)
-    item.add_video_info("strategy", "search")
-    item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
-    mq_obj = item.produce_item()
-    ETL_MQ.send_msg(params=mq_obj)
-    logging(
-        code="6002",
-        info="发送消息至 ETL",
-        data=mq_obj
-    )
+from applications.functions.log import logging
+from applications.functions.video_item import video_mq_sender
 
 
-def return_video(video_path, title, trace_id):
+def recall_search_video(video_path, title, trace_id):
     """
     search and send msg to ETL
     :param trace_id:
@@ -76,81 +21,45 @@ def return_video(video_path, title, trace_id):
     with open(video_path, encoding='utf-8') as f:
         my_obj = json.loads(f.read())
     if my_obj:
-        # 三者都搜索,优先搜索 title
-        title_result = wx_search(keys=title)
-        if title_result['msg'] == '未知错误':
-            logging(
-                code="7001",
-                info="通过标题搜索失败---{}".format(title),
-                trace_id=trace_id
-            )
-        else:
-            obj_list = title_result['data']['data']
-            if obj_list:
-                return obj_list[0]
-            # for obj in obj_list:
-            #     try:
-            #         title = obj['items'][0]['title'].replace('<em class=\"highlight\">', '').replace('</em>',
-            #                                                                                      '').replace("#",
-            #                                                                                             "")
-            #         if Functions().sensitive_flag(title):
-            #             return obj
-            #         else:
-            #             continue
-            #     except Exception as e:
-            #         print(e)
-            #         continue
-
-        # # search_keys
-        search_keys_result = wx_search(keys=my_obj['search_keys'][0])
-        if search_keys_result['msg'] == '未知错误':
-            logging(
-                code="7001",
-                info="通过搜索词搜索失败---{}".format(title),
-                trace_id=trace_id
-            )
+        wx_result = wx_search(keys=title)
+        if wx_result:
+            return {
+                "platform": "wx_search",
+                "result": wx_result[0]
+            }
         else:
-            obj_list = search_keys_result['data']['data']
-            if obj_list:
-                return obj_list[0]
-        #     for obj in obj_list:
-        #         try:
-        #             title = obj['items'][0]['title'].replace('<em class=\"highlight\">', '').replace('</em>',
-        #                                                                                              '').replace("#",
-        #                                                                                                          "")
-        #             if Functions().sensitive_flag(title):
-        #                 return obj
-        #             else:
-        #                 continue
-        #         except Exception as e:
-        #             print(e)
-        #             continue
-
-        # theme
-        theme_result = wx_search(keys=my_obj['theme'])
-        if theme_result['msg'] == '未知错误':
             logging(
                 code="7001",
-                info="通过主题搜索失败---{}".format(title),
+                info="通过微信搜索失败---{}".format(title),
                 trace_id=trace_id
             )
-        else:
-            obj_list = theme_result['data']['data']
-            if obj_list:
-                return obj_list[0]
-            # for obj in obj_list:
-            #     try:
-            #         title = obj['items'][0]['title'].replace('<em class=\"highlight\">', '').replace('</em>',
-            #                                                                                          '').replace("#",
-            #                                                                                                      "")
-            #         if Functions().sensitive_flag(title):
-            #             return obj
-            #         else:
-            #             continue
-            #     except Exception as e:
-            #         print(e)
-            #         continue
-        return None
+            # 微信搜不到的话,采用好看视频搜索
+            baidu_result = hksp_search(key=title)
+            if baidu_result:
+                return {
+                    "platform": "baidu_search",
+                    "result": baidu_result[0]
+                }
+            else:
+                # 若好看视频未搜到,则采用西瓜搜索
+                logging(
+                    code="7001",
+                    info="通过baidu搜索失败---{}".format(title),
+                    trace_id=trace_id
+                )
+                xigua_result = xigua_search(title)
+                if xigua_result:
+                    return {
+                        "platform": "xg_search",
+                        "result": xigua_result[0]
+                    }
+                else:
+                    logging(
+                        code="7001",
+                        info="通过西瓜搜索失败---{}".format(title),
+                        trace_id=trace_id
+                    )
+                    return None
     else:
         logging(
             code="7000",
@@ -169,18 +78,21 @@ def search_videos(video_path, title, trace_id, gh_id):
     :param trace_id:
     :return:
     """
-    video_obj = return_video(video_path, title, trace_id)
-    if video_obj:
+    recall_obj = recall_search_video(video_path, title, trace_id)
+    platform = recall_obj["platform"]
+    recall_video = recall_obj["result"]
+    if recall_video:
         logging(
             code="7002",
-            info="视频搜索成功",
+            info="视频搜索成功, 搜索平台为--{}".format(platform),
             trace_id=trace_id,
-            data=video_obj
+            data=recall_video
         )
-        process_weixin_video_obj(
-            video_obj=video_obj['items'][0],
+        video_mq_sender(
+            video_obj=recall_video,
             user=gh_id_dict.get(gh_id),
-            trace_id=trace_id
+            trace_id=trace_id,
+            platform=platform
         )
     else:
         logging(

+ 22 - 17
applications/search/hksp_search.py

@@ -7,6 +7,8 @@ import urllib.parse
 import time
 import hashlib
 
+from applications.functions.common import MySQLServer
+
 
 def get_video_detail(video_id):
     """
@@ -35,7 +37,6 @@ def get_video_detail(video_id):
         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
     }
     response = requests.request("GET", url, headers=headers, params=params).json()
-    # print(json.dumps(response['data']['apiData']['curVideoMeta'], ensure_ascii=False, indent=4))
     return response['data']['apiData']['curVideoMeta']
 
 
@@ -43,6 +44,19 @@ def hksp_search(key):
     """
     好看视频搜索爬虫
     """
+    sensitive_words = MySQLServer().select_sensitive_words()
+
+    def sensitive_flag(s_words, ori_title):
+        """
+        Report whether a title is free of sensitive words.
+
+        :param s_words: iterable of sensitive words
+        :param ori_title: title to check
+        :return: False if any word occurs in ori_title, otherwise True
+        """
+        return not any(banned in ori_title for banned in s_words)
+
     timestamp_seconds = time.time()
     timestamp_milliseconds = int(timestamp_seconds * 1000)
     url = 'https://haokan.baidu.com/haokan/ui-search/pc/search/video'
@@ -63,32 +77,23 @@ def hksp_search(key):
         'authority': 'haokan.baidu.com',
         'accept': '*/*',
         'accept-language': 'zh,en;q=0.9,zh-CN;q=0.8',
-        'cookie': "BIDUPSID='",
-        # 'referer': 'https://haokan.baidu.com/web/search/page?query=%E8%80%81%E4%BA%BA',
-        'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
-        'sec-ch-ua-mobile': '?0',
-        'sec-ch-ua-platform': '"macOS"',
-        'sec-fetch-dest': 'empty',
-        'sec-fetch-mode': 'cors',
-        'sec-fetch-site': 'same-origin',
+        'cookie': "BIDUPSID=",
         'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
         'x-requested-with': 'xmlhttprequest',
     }
     # 发送GET请求
     response = requests.get(url, headers=headers, params=params).json()
-    # print(json.dumps(response, ensure_ascii=False, indent=4))
     data_list = response['data']['list']
+
     L = []
-    for data in data_list[:5]:
+    for data in data_list:
         try:
             video_id = data['vid']
             res = get_video_detail(video_id)
-            temp = ["haokanshipin", res['title'], res['playurl'], "https://haokan.baidu.com/v?vid={}".format(video_id)]
-            L.append(temp)
+            if sensitive_flag(sensitive_words, ['title']) and int(res['duration']) <= 300:
+                L.append(res)
+            else:
+                continue
         except:
             pass
     return L
-
-
-if __name__ == '__main__':
-    hksp_search("美国竟对中国提出4个荒唐的条件,真是好大的口气")

+ 36 - 2
applications/search/weixin_search.py

@@ -4,6 +4,8 @@
 import json
 import requests
 
+from applications.functions.common import MySQLServer
+
 
 def wx_search(keys):
     """
@@ -11,6 +13,20 @@ def wx_search(keys):
     :param keys:
     :return:
     """
+
+    sensitive_words = MySQLServer().select_sensitive_words()
+
+    def sensitive_flag(s_words, ori_title):
+        """
+        Report whether a title is free of sensitive words.
+
+        :param s_words: iterable of sensitive words
+        :param ori_title: title to check
+        :return: False if any word occurs in ori_title, otherwise True
+        """
+        return not any(banned in ori_title for banned in s_words)
+
     url = "http://8.217.190.241:8888/crawler/wei_xin/keyword"
     payload = json.dumps({
         "keyword": keys,
@@ -20,5 +36,23 @@ def wx_search(keys):
     headers = {
         'Content-Type': 'application/json'
     }
-    response = requests.request("POST", url, headers=headers, data=payload)
-    return response.json()
+    response = requests.request("POST", url, headers=headers, data=payload).json()
+    if response['msg'] == '未知错误':
+        return []
+    else:
+        L = []
+        if response['data']:
+            video_list = response['data']['data']
+            for video in video_list:
+                try:
+                    video_info = video['items'][0]
+                    title = video_info['title']
+                    duration_str = video_info['duration']
+                    dr = int(duration_str.split(":")[0].strip()) + int(duration_str.split(":")[1].strip())
+                    if sensitive_flag(sensitive_words, title) and dr <= 300:
+                        L.append(video_info)
+                    else:
+                        continue
+                except:
+                    pass
+        return L

+ 179 - 175
applications/search/xigua_search.py

@@ -4,183 +4,188 @@
 """
 import re
 import json
-import time
-import random
 import base64
+import requests
 import urllib.parse
 
-import requests
 from lxml import etree
 from Crypto.Cipher import AES
 from Crypto.Util.Padding import unpad
 from fake_useragent import FakeUserAgent
 
-
-def byte_dance_cookie(item_id):
-    """
-    获取西瓜视频的 cookie
-    :param item_id:
-    """
-    sess = requests.Session()
-    sess.headers.update({
-        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
-        'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
-    })
-
-    # 获取 cookies
-    sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
-    data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
-    r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
-    # print(r.text)
-    return r.cookies.values()[0]
+from applications.functions.common import MySQLServer
 
 
-def aes_decrypt(data: str, key: str) -> str:
+class XiGuaFunctions(object):
     """
-    XiGua AES decrypt
-    :param data:
-    :param key:
-    :return:
+    XiGuaSearch Class
     """
-    password = key.encode()
-    iv = password[:16]
-    try:
-        ct = base64.b64decode(data.encode())
-        cipher = AES.new(password, AES.MODE_CBC, iv)
-        pt = unpad(cipher.decrypt(ct), AES.block_size)
-        return base64.b64decode(pt).decode()
-    except Exception as e:
-        print("Incorrect decryption {}".format(e))
-        return None
-
 
-def extract_video_url(text):
-    """
-    获取视频 video_url
-    :param text:
-    :return:
-    """
-    HTML = etree.HTML(text)
-    str_2 = HTML.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
-    json_2 = str_2[str_2.find('{'):str_2.rfind('}') + 1]
-    Irregulars = ['null', 'undefined', '=false', '=true', 'false', 'true']
-    # python中不规则的定义
-    for I in Irregulars:
-        if I in ['=false', '=true']:
-            json_2 = json_2.replace(I, '=' + I[1:].capitalize())
+    @classmethod
+    def tunnel_proxies(cls):
+        """
+            快代理方法
+            :return:
+            """
+        tunnel = "q796.kdltps.com:15818"
+        username = "t17772369458618"
+        password = "5zqcjkmy"
+        proxies = {
+            "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
+            "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
+        }
+        return proxies
+
+    @classmethod
+    def byte_dance_cookie(cls, item_id):
+        """
+        获取西瓜视频的 cookie
+        :param item_id:
+        """
+        sess = requests.Session()
+        sess.headers.update({
+            'user-agent': FakeUserAgent().chrome,
+            'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
+        })
+
+        # 获取 cookies
+        sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
+        data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
+        r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
+        if r.json()['redirect_url']:
+            requests.get(
+                url=r.json()['redirect_url']
+            )
+        return r.cookies.values()[0]
+
+    @classmethod
+    def aes_decrypt(cls, data, key):
+        """
+        XiGua AES decrypt
+        :param data:
+        :param key:
+        :return:
+        """
+        password = key.encode()
+        iv = password[:16]
+        try:
+            ct = base64.b64decode(data.encode())
+            cipher = AES.new(password, AES.MODE_CBC, iv)
+            pt = unpad(cipher.decrypt(ct), AES.block_size)
+            return base64.b64decode(pt).decode()
+        except Exception as e:
+            print("Incorrect decryption {}".format(e))
+            return None
+
+    @classmethod
+    def extract_video_url(cls, text):
+        """
+        获取视频 video_url
+        :param text:
+        :return:
+        """
+        HTML = etree.HTML(text)
+        str_2 = HTML.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
+        json_2 = str_2[str_2.find('{'):str_2.rfind('}') + 1]
+        Irregulars = ['null', 'undefined', '=false', '=true', 'false', 'true']
+        # python中不规则的定义
+        for I in Irregulars:
+            if I in ['=false', '=true']:
+                json_2 = json_2.replace(I, '=' + I[1:].capitalize())
+            else:
+                json_2 = json_2.replace(I, '12')
+        dict_2 = json.loads(json_2)["anyVideo"]["gidInformation"]["packerData"]["video"]
+        duration = dict_2["video_duration"]
+        play_cnt = dict_2['video_watch_count']
+        publish_time = int(dict_2['video_publish_time'])
+        like_cnt = dict_2['video_like_count']
+        video_title = dict_2['title']
+        video_id = dict_2['vid']
+        video_res = dict_2['videoResource']
+        cover_url = dict_2['poster_url'].replace("\\u002F", "/")
+        if video_res['dash'] == 12:
+            obj = video_res['normal']
+            ptk = obj['ptk']
+            video_list = obj['video_list']
+            keys = list(video_list.keys())
+            main_url = video_list[keys[-1]]['main_url']
+            real_video_url = cls.aes_decrypt(data=main_url, key=ptk)
         else:
-            json_2 = json_2.replace(I, '12')
-    dict_2 = json.loads(json_2)["anyVideo"]["gidInformation"]["packerData"]["video"]["videoResource"]
-    if dict_2['dash'] == 12:
-        obj = dict_2['normal']
-        ptk = obj['ptk']
-        main_url = obj['video_list']['video_3']['main_url']
-        real_video_url = aes_decrypt(data=main_url, key=ptk)
-    else:
-        obj = dict_2['dash']
-        ptk = obj["ptk"]
-        video_url = obj['dynamic_video']['main_url']
-        real_video_url = aes_decrypt(data=video_url, key=ptk)
-    return real_video_url
-
-
-def extract_info_by_re(text):
-    """
-    通过正则表达式获取文本中的信息
-    :param text:
-    :return:
-    """
-    # 标题
-    title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
-    if title_match:
-        title_content = title_match.group(1)
-        title_content = title_content.split(" - ")[0]
-        title_content = bytes(title_content, "latin1").decode()
-    else:
-        title_content = ""
-
-    # video_id
-    video_id = re.search(r'"vid":"(.*?)"', text).group(1)
-
-    # like_count
-    like_count = re.search(r'"video_like_count":(.*?),', text).group(1)
-
-    # cover_url
-    cover_url = re.search(r'"avatar_url":"(.*?)"', text).group(1)
-
-    # video_play
-    video_watch_count = re.search(r'"video_watch_count":(.*?),', text).group(1)
-
-    # "video_publish_time"
-    publish_time = re.search(r'"video_publish_time":"(.*?)"', text).group(1)
-
-    # video_duration
-    duration = re.search(r'("video_duration":)(.*?)"', text).group(2).replace(",", "")
-
-    return {
-        "title": title_content,
-        "url": extract_video_url(text),
-        "video_id": video_id,
-        "like_count": like_count,
-        "cover_url": cover_url,
-        "play_count": video_watch_count,
-        "publish_time": publish_time,
-        "duration": duration
-    }
-
-
-def get_video_info(item_id):
-    """
-    获取视频信息
-    """
-    url = "https://www.ixigua.com/{}".format(item_id)
-    headers = {
-        "accept-encoding": "gzip, deflate",
-        "accept-language": "zh-CN,zh-Hans;q=0.9",
-        "cookie": "ttwid={}".format(byte_dance_cookie(item_id)),
-        "user-agent": FakeUserAgent().random,
-        "referer": "https://www.ixigua.com/{}/".format(item_id),
-    }
-    response = requests.get(
-        url=url,
-        headers=headers,
-        # proxies=tunnel_proxies(),
-        timeout=5,
-    )
-    time.sleep(random.randint(1, 5))
-    video_info = extract_info_by_re(response.text)
-
-    video_dict = {
-        "video_title": video_info.get("title", ""),
-        "video_id": video_info.get("video_id"),
-        "gid": str(item_id),
-        "play_cnt": int(video_info.get("play_count", 0)),
-        "like_cnt": int(video_info.get("like_count", 0)),
-        "comment_cnt": 0,
-        "share_cnt": 0,
-        "favorite_cnt": 0,
-        "duration": int(video_info.get("duration", 0)),
-        "video_width": 0,
-        "video_height": 0,
-        "publish_time_stamp": int(video_info.get("publish_time", 0)),
-        "publish_time_str": time.strftime(
-            "%Y-%m-%d %H:%M:%S",
-            time.localtime(int(video_info.get("publish_time", 0))),
-        ),
-        "avatar_url": str(
-            video_info.get("user_info", {}).get("avatar_url", "")
-        ),
-        "cover_url": video_info.get("cover_url", "").replace("\\u002F", "/"),
-        "video_url": video_info.get("url"),
-        "session": f"xigua-author-{int(time.time())}",
-    }
-    return video_dict
+            obj = video_res['dash']
+            ptk = obj["ptk"]
+            video_url = obj['dynamic_video']['main_url']
+            real_video_url = cls.aes_decrypt(data=video_url, key=ptk)
+        return {
+            "video_url": real_video_url,
+            "cover_url": cover_url,
+            "video_id": video_id,
+            "video_title": video_title,
+            "like_cnt": like_cnt,
+            "play_cnt": play_cnt,
+            "publish_time": publish_time,
+            "duration": duration
+        }
+
+    @classmethod
+    def extract_info_by_re(cls, text):
+        """
+        通过正则表达式获取文本中的信息
+        :param text:
+        :return:
+        """
+        result = cls.extract_video_url(text)
+        # 标题
+        title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
+        if title_match:
+            title_content = title_match.group(1)
+            title_content = title_content.split(" - ")[0]
+            try:
+                title_content = bytes(title_content, "latin1").decode()
+            except:
+                title_content = title_content
+        else:
+            title_content = ""
+        result['video_title'] = title_content
+        return result
+
+    @classmethod
+    def get_video_info(cls, item_id):
+        """
+        获取视频信息
+        """
+        url = "https://www.ixigua.com/{}".format(item_id)
+        headers = {
+            "accept-encoding": "gzip, deflate",
+            "accept-language": "zh-CN,zh-Hans;q=0.9",
+            "cookie": "ttwid={}".format(cls.byte_dance_cookie(item_id)),
+            "user-agent": FakeUserAgent().random,
+            "referer": "https://www.ixigua.com/{}/".format(item_id),
+        }
+        response = requests.get(
+            url=url,
+            headers=headers
+        )
+        video_info = cls.extract_info_by_re(response.text)
+        return video_info
 
 
 def xigua_search(keyword):
     """
     搜索
     """
+    sensitive_words = MySQLServer().select_sensitive_words()
+
+    def sensitive_flag(s_words, ori_title):
+        """
+        :param ori_title:
+        :param s_words:
+        :return:
+        """
+        for word in s_words:
+            if word in ori_title:
+                return False
+        return True
+
     keyword = urllib.parse.quote(keyword)
     base_url = "https://www.ixigua.com/search/{}/ab_name=search&fss=input".format(
         keyword
@@ -191,13 +196,6 @@ def xigua_search(keyword):
         "accept-language": "zh,en;q=0.9,zh-CN;q=0.8",
         "cache-control": "max-age=0",
         "cookie": "ixigua-a-s=1; support_webp=true; support_avif=true; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=Ur23fgYD2pMJOvi1BpILyfaobg8wA7IhGwmQx260ULRa8Dvjaxc5ZA63BUIP-6Vi473f; ttwid=1%7CNtTtSp4Iej-v0nWtepdZH3d3Ts6uGNMFzTN20ps1cdo%7C1708236945%7Cc1f301c64aa3bf69cdaa41f28856e2bb7b7eed16583f8c92d50cffa2d9944fc6; msToken=rr418opQf04vm8n9s8FAGdr1AoCUsvAOGKSDPbBEfwVS1sznxxZCvcZTI93qVz5uAXlX9yRwcKlNQZ4wMro2DmlHw5yWHAVeKr_SzgO1KtVVnjUMTUNEux_cq1-EIkI=",
-        "sec-ch-ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
-        "sec-ch-ua-mobile": "?0",
-        "sec-ch-ua-platform": '"macOS"',
-        "sec-fetch-dest": "document",
-        "sec-fetch-mode": "navigate",
-        "sec-fetch-site": "none",
-        "sec-fetch-user": "?1",
         "upgrade-insecure-requests": "1",
         "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
     }
@@ -206,14 +204,20 @@ def xigua_search(keyword):
     result = html.xpath(
         '//a[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]/@href'
     )
-    res_list = []
-    for page_id in result[:5]:
-        doc_id = page_id[1:].split("?")[0]
-        try:
-            res = get_video_info(doc_id)
-            temp = ["xigua", res['video_title'], res['video_url'], "https://www.ixigua.com/{}".format(doc_id)]
-            res_list.append(temp)
-        except:
-            pass
-    return res_list
-
+    if result:
+        L = []
+        doc_id_list = [page_id[1:] for page_id in result]
+        for doc_id in doc_id_list:
+            try:
+                video_d = XiGuaFunctions().get_video_info(doc_id)
+                video_title = video_d['video_title']
+                if sensitive_flag(sensitive_words, video_title) and int(video_d['duration']) <= 300:
+                    L.append(video_d)
+                else:
+                    continue
+            except Exception as e:
+                print(e)
+                continue
+        return L
+    else:
+        return []

+ 26 - 1
applications/static/config.py

@@ -299,4 +299,29 @@ gh_id_dict = {
         "uid": 69637480,
         "nick_name": "风间"
     }
-}
+}
+
+sensitive_words = [
+    "台湾",
+    "南海",
+    "强奸",
+    "寂寞难耐",
+    "欲求不满",
+    "不雅视频",
+    "人妻",
+    "侵犯",
+    "正部级",
+    "外长",
+    "邓小平",
+    "林彪",
+    "李先念",
+    "毛主席",
+    "毛泽东",
+    "江青",
+    "朱镕基",
+    "胡耀邦",
+    "政治局",
+    "省委书记",
+    "国防部长",
+    "外交部长"
+]

+ 1 - 1
test.py → dev/test.py

@@ -10,7 +10,7 @@ body = {
     "content": "",
     "cover": "",
     "ghId": "gh_d2cc901deca7",
-    "title": "江泽民"
+    "title": "掘金大战森林狼"
 }
 a = time.time()
 header = {

+ 15 - 0
dev/test_search.py

@@ -0,0 +1,15 @@
+"""
+@author: luojunhui
+"""
+from applications.search import *
+
+keys = "湖人大战勇士"
+
+wx_result = wx_search(keys)
+print(wx_result)
+
+xg_result = xigua_search(keys)
+print(xg_result)
+
+baidu_result = hksp_search(keys)
+print(baidu_result)