Browse Source

2024-05-05
上线抖音搜索

罗俊辉 10 months ago
parent
commit
277d640b5e

+ 20 - 2
applications/functions/async_etl.py

@@ -49,7 +49,7 @@ class AsyncETL(object):
         }
         self.max_retry = 5
 
-    def request_header(self):
+    def request_header(self, type_="video"):
         """
         请求头
         :return:
@@ -73,6 +73,15 @@ class AsyncETL(object):
                     "Origin": "https://www.ixigua.com/",
                     "Referer": "https://www.ixigua.com/"
                 }
+            elif type_ == "cover":
+                headers = {
+                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+                    'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
+                    'Cache-Control': 'max-age=0',
+                    'Proxy-Connection': 'keep-alive',
+                    'Upgrade-Insecure-Requests': '1',
+                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
+                }
             else:
                 headers = {
                     "Accept": "*/*",
@@ -96,6 +105,15 @@ class AsyncETL(object):
                 "Origin": "https://mp.weixin.qq.com",
                 "Referer": "https://mp.weixin.qq.com"
             }
+        elif self.platform == "dy_search":
+            headers = {
+                'accept': '*/*',
+                'accept-language': 'en,zh;q=0.9,zh-CN;q=0.8',
+                'priority': 'i',
+                'range': 'bytes=0-',
+                'referer': 'https://v11-coldf.douyinvod.com/',
+                'user-agent': FakeUserAgent().chrome
+            }
         else:
             headers = {}
         return headers
@@ -181,7 +199,7 @@ class AsyncETL(object):
         :param file_path:
         :return:
         """
-        headers = self.request_header()
+        headers = self.request_header(type_="cover")
         response = requests.get(url=self.cover_url, headers=headers)
         with open(file_path, "wb") as f:
             f.write(response.content)

+ 30 - 1
applications/functions/video_item.py

@@ -198,4 +198,33 @@ class VideoProducer(object):
         item.add_video_info("strategy", "search")
         item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
         mq_obj = item.produce_item()
-        return mq_obj
+        return mq_obj
+
+    @classmethod
+    def dy_video_producer(cls, video_obj, user, trace_id):
+        """
+        Build the MQ payload for a video recalled through Douyin search.
+
+        :param video_obj: Douyin detail dict; this method reads the keys
+            publish_timestamp, channel_content_id, video_url_list,
+            image_url_list, play_count and like_count
+        :param user: dict providing "uid" and "nick_name"
+        :param trace_id: trace id; stored as both video_title and out_user_id
+        :return: the object returned by VideoItem.produce_item()
+        """
+        source = "dy_search"
+        # publish_timestamp is divided by 1000 -- presumably milliseconds
+        # converted to seconds; confirm against the crawler's schema.
+        published_at = int(video_obj['publish_timestamp'] / 1000)
+        video_item = VideoItem()
+        put = video_item.add_video_info
+        put("user_id", user["uid"])
+        put("user_name", user["nick_name"])
+        put("video_id", video_obj['channel_content_id'])
+        put("video_title", trace_id)
+        put("publish_time_stamp", published_at)
+        # Only the first entry of the url / image lists is used.
+        put("video_url", video_obj["video_url_list"][0]['video_url'])
+        put("cover_url", video_obj["image_url_list"][0]['image_url'])
+        put("out_video_id", video_obj['channel_content_id'])
+        put("play_cnt", video_obj['play_count'])
+        put("duration", video_obj["video_url_list"][0]['video_duration'])
+        put("like_cnt", video_obj['like_count'])
+        put("out_user_id", trace_id)
+        put("platform", source)
+        put("strategy", "search")
+        put("session", "{}-{}".format(source, int(time.time())))
+        return video_item.produce_item()

+ 57 - 11
applications/schedule/search_schedule.py

@@ -48,20 +48,20 @@ class SearchABTest(object):
         """
         兜底策略
         """
-        result = await SearchMethod().search_v0(
+        result = await SearchMethod().search_v1(
             text=cls.article_keys[0],
             trace_id=cls.trace_id
         )
         if result:
             return result
         else:
-            sub_result = await SearchMethod().search_v0(
+            sub_result = await SearchMethod().search_v1(
                 text=cls.article_keys[1],
                 trace_id=cls.trace_id)
             if sub_result:
                 return sub_result
             else:
-                return await SearchMethod().search_v0(
+                return await SearchMethod().search_v1(
                     text=cls.article_keys[2],
                     trace_id=cls.trace_id
                 )
@@ -72,7 +72,7 @@ class SearchABTest(object):
         默认原标题搜索
         :return:
         """
-        search_result = await SearchMethod().search_v0(
+        search_result = await SearchMethod().search_v1(
             text=cls.ori_title,
             trace_id=cls.trace_id
         )
@@ -87,7 +87,7 @@ class SearchABTest(object):
         使用 content_summary搜索
         :return:
         """
-        search_result = await SearchMethod().search_v0(
+        search_result = await SearchMethod().search_v1(
             text=cls.article_summary,
             trace_id=cls.trace_id
         )
@@ -102,7 +102,7 @@ class SearchABTest(object):
         使用文本关键词搜索
         :return:
         """
-        search_result = await SearchMethod().search_v0(
+        search_result = await SearchMethod().search_v1(
             text=cls.article_keys[0],
             trace_id=cls.trace_id
         )
@@ -117,7 +117,7 @@ class SearchABTest(object):
         使用文本关键词搜索
         :return:
         """
-        search_result = await SearchMethod().search_v0(
+        search_result = await SearchMethod().search_v1(
             text=cls.article_keys[1],
             trace_id=cls.trace_id
         )
@@ -132,7 +132,7 @@ class SearchABTest(object):
         使用文本关键词搜索
         :return:
         """
-        search_result = await SearchMethod().search_v0(
+        search_result = await SearchMethod().search_v1(
             text=cls.article_keys[2],
             trace_id=cls.trace_id
         )
@@ -188,6 +188,46 @@ class SearchMethod(object):
                     )
                     return None
 
+    @classmethod
+    async def search_v1(cls, text, trace_id):
+        """
+        Search one text across platforms in priority order and return the
+        first hit: douyin ---> baidu (hksp_search) ---> xigua.
+
+        :param text: keyword / text to search for
+        :param trace_id: trace id attached to every failure log
+        :return: {"platform": <platform name>, "result": <first hit>}, or
+            None when all three searches come back empty
+        """
+        douyin_result = douyin_search(keyword=text, sensitive_words=cls.s_words)
+        if douyin_result:
+            return {"platform": "dy_search", "result": douyin_result[0]}
+        logging(
+            code="7001",
+            info="抖音搜索失败--{}".format(text),
+            trace_id=trace_id
+        )
+        # Pause before trying the next platform. This must be the awaitable
+        # sleep: a blocking time.sleep() inside a coroutine would stall the
+        # whole event loop for the duration.
+        await asyncio.sleep(1)
+        baidu_result = hksp_search(key=text, sensitive_words=cls.s_words)
+        if baidu_result:
+            return {"platform": "baidu_search", "result": baidu_result[0]}
+        # Nothing found via hksp_search (baidu): fall back to Xigua search.
+        logging(
+            code="7001",
+            info="通过baidu搜索失败---{}".format(text),
+            trace_id=trace_id,
+        )
+        xigua_result = xigua_search_v2(keyword=text, sensitive_words=cls.s_words)
+        if xigua_result:
+            return {"platform": "xg_search", "result": xigua_result[0]}
+        logging(
+            code="7001",
+            info="通过西瓜搜索失败---{}, 启用兜底方式".format(text),
+            trace_id=trace_id,
+        )
+        return None
+
+
 
 async def video_sender(video_obj, user, trace_id, platform):
     """
@@ -219,6 +259,12 @@ async def video_sender(video_obj, user, trace_id, platform):
             user=user,
             trace_id=trace_id,
         )
+    elif platform == "dy_search":
+        mq_obj = Video.dy_video_producer(
+            video_obj=video_obj,
+            user=user,
+            trace_id=trace_id,
+        )
     else:
         mq_obj = {}
     AE = AsyncETL(video_obj=mq_obj)
@@ -243,7 +289,7 @@ async def search_videos(params, trace_id, gh_id, mysql_client):
     """
     K = KimiServer()
     kimi_info = await K.search_kimi_schedule(params=params)
-    print(json.dumps(kimi_info, ensure_ascii=False, indent=4))
+    print("{}---kimi 挖掘正常".format(trace_id))
     kimi_title = kimi_info['k_title']
     content_title = kimi_info['content_title'].replace("'", "").replace('"', "")
     content_keys = json.dumps(kimi_info['content_keys'], ensure_ascii=False)
@@ -258,10 +304,12 @@ async def search_videos(params, trace_id, gh_id, mysql_client):
     kimi_info["trace_id"] = trace_id
     SearchAB = SearchABTest(info=kimi_info, gh_id=gh_id)
     recall_obj_1 = await SearchAB.ab_1()
+    # recall_obj_1 = await SearchAB.ab_0()
     await asyncio.sleep(3)
     recall_obj_2 = await SearchAB.ab_2()
     await asyncio.sleep(3)
     recall_obj_3 = await SearchAB.ab_3()
+    print("{}---视频搜索正常".format(trace_id))
     recall_list = [recall_obj_1, recall_obj_2, recall_obj_3]
     un_empty_list = [i for i in recall_list if i]
     if len(un_empty_list) < 3:
@@ -302,5 +350,3 @@ async def search_videos(params, trace_id, gh_id, mysql_client):
             info="视频搜索失败, 被敏感词过滤",
             trace_id=trace_id
         )
-
-

+ 2 - 1
applications/search/__init__.py

@@ -3,4 +3,5 @@
 """
 from .hksp_search import hksp_search
 from .weixin_search import wx_search
-from .xigua_search import xigua_search_v2
+from .xigua_search import xigua_search_v2
+from .dy_search import douyin_search

+ 63 - 0
applications/search/dy_search.py

@@ -0,0 +1,63 @@
+"""
+@author: luojunhui
+"""
+import json
+import requests
+
+from applications.functions.common import sensitive_flag
+
+
+def douyin_search(keyword, sensitive_words, timeout=60):
+    """
+    Search Douyin through the crawler service and return the first video
+    whose title passes the sensitive-word check.
+
+    :param keyword: the words to search for
+    :param sensitive_words: sensitive words, handed to sensitive_flag
+    :param timeout: seconds to wait for the search request; without it
+        requests would wait indefinitely on an unresponsive crawler
+    :return: a one-element list holding the video detail, or [] when the
+        request fails, the payload is malformed, or no video qualifies
+    """
+    url = "http://8.217.190.241:8888/crawler/dou_yin/top_hub_content"
+    payload = json.dumps({
+        "keyword": keyword,
+        "category": "全部",
+        "period": "近7天",
+        "content_modal": "视频",
+        "cursor": ""
+    })
+    headers = {
+        'Content-Type': 'application/json'
+    }
+
+    try:
+        response = requests.request(
+            "POST", url, headers=headers, data=payload, timeout=timeout
+        )
+        dt_list = response.json()['data']['data']
+    except (requests.RequestException, ValueError, KeyError, TypeError):
+        # Crawler unreachable, timed out, or answered with an unexpected
+        # payload: report "nothing found" so that callers treating an
+        # empty list as a miss can move on instead of crashing.
+        return []
+
+    for obj in dt_list or []:
+        try:
+            title = obj['video_desc']
+            video_id = obj['video_id']
+            # A truthy sensitive_flag result is what lets a title through
+            # (presumably "title is clean" -- confirm in sensitive_flag).
+            if not sensitive_flag(sensitive_words, title):
+                continue
+            detail = douyin_detail(video_id)
+        except Exception:
+            # Best effort: a broken candidate must not abort the search.
+            continue
+        # Skip empty detail payloads rather than returning them as a hit.
+        if detail:
+            return [detail]
+    return []
+
+
+def douyin_detail(video_id, timeout=60):
+    """
+    Fetch the detail record of one Douyin video from the crawler service.
+
+    :param video_id: id sent to the service as "content_id"
+    :param timeout: seconds to wait for the HTTP call; without it requests
+        would wait indefinitely on an unresponsive crawler
+    :return: the value found at ["data"]["data"] in the JSON reply
+    """
+    url = "http://8.217.190.241:8888/crawler/dou_yin/detail"
+    payload = json.dumps({
+        "content_id": video_id
+    })
+    headers = {
+        'Content-Type': 'application/json'
+    }
+    # Network, JSON-decoding and missing-key errors propagate to the caller.
+    response = requests.request(
+        "POST", url, headers=headers, data=payload, timeout=timeout
+    ).json()
+    video_info = response['data']['data']
+    return video_info