浏览代码

2024-06-11
优化重搜索逻辑
视频生成文章接口返回三条视频

罗俊辉 1 年之前
父节点
当前提交
ebc75f003b

+ 0 - 1
applications/functions/async_etl.py

@@ -1,7 +1,6 @@
 """
 @author: luojunhui
 """
-import json
 import os
 
 import oss2

+ 2 - 1
applications/functions/async_mysql.py

@@ -23,7 +23,8 @@ class AsyncMySQLClient(object):
             user='crawler',
             password='crawler123456@',
             db='piaoquan-crawler',
-            charset='utf8mb4'
+            charset='utf8mb4',
+            connect_timeout=120,
         )
         print("mysql init successfully")
 

+ 3 - 4
applications/routes.py

@@ -1,14 +1,13 @@
 """
 @author: luojunhui
 """
-import json
 import time
 import uuid
 import asyncio
 from quart import Blueprint, jsonify, request
 
 from applications.functions.log import logging
-from applications.schedule import recall_videos, search_videos, return_info_v2
+from applications.schedule import recall_videos, search_videos, re_search_videos
 
 my_blueprint = Blueprint('LongArticles', __name__)
 
@@ -132,7 +131,7 @@ def Routes(mysql_client):
         return jsonify(result)
 
     @my_blueprint.route('/re_search_videos', methods=['POST'])
-    async def re_search_videos():
+    async def ree_search_videos():
         """
         重新搜索
         :return:
@@ -149,7 +148,7 @@ def Routes(mysql_client):
         )
         try:
             asyncio.ensure_future(
-                search_videos(
+                re_search_videos(
                     params=params,
                     trace_id=trace_id,
                     gh_id=gh_id,

+ 1 - 1
applications/schedule/__init__.py

@@ -2,5 +2,5 @@
 @author: luojunhui
 """
 from .process_schedule import recall_videos
-from .search_schedule import search_videos
+from .search_schedule import search_videos, re_search_videos
 from .process_schedule import return_info_v2

+ 6 - 1
applications/schedule/process_schedule.py

@@ -160,13 +160,18 @@ async def recall_videos(trace_id, mysql_client):
                     mysql_client=mysql_client,
                     index=index
                 )
+                L.append(temp)
             else:
                 temp = await return_info_v2(
                     video_id=best_video_id,
                     trace_id=trace_id,
                     mysql_client=mysql_client
                 )
-            L.append(temp)
+                L.append(temp)
+                temp['paragraphPosition'] = 0.5
+                L.append(temp)
+                temp['paragraphPosition'] = 0.75
+                L.append(temp)
         result = {
             "traceId": trace_id,
             "miniprogramList": L

+ 111 - 51
applications/schedule/search_schedule.py

@@ -141,6 +141,46 @@ class SearchABTest(object):
         else:
             return await cls.base_line()
 
+    @classmethod
+    async def ab_5(cls):
+        """
+        增量搜索, 返回result_list
+        :return:
+        """
+        result_list = await SearchMethod().search_v2(
+            text=cls.article_summary[:15],
+            trace_id=cls.trace_id
+        )
+        if len(result_list) > 3:
+            return result_list
+        else:
+            result_list += await SearchMethod().search_v2(
+                text=cls.ori_title[:15],
+                trace_id=cls.trace_id
+            )
+            if len(result_list) > 3:
+                return result_list
+            else:
+                result_list += await SearchMethod().search_v2(
+                    text=cls.article_keys[0],
+                    trace_id=cls.trace_id
+                )
+                if len(result_list) > 3:
+                    return result_list
+                else:
+                    result_list += await SearchMethod().search_v2(
+                        text=cls.article_keys[1],
+                        trace_id=cls.trace_id
+                    )
+                    if result_list:
+                        return result_list
+                    else:
+                        result_list += await SearchMethod().search_v2(
+                            text=cls.article_keys[2],
+                            trace_id=cls.trace_id
+                        )
+                        return result_list
+
 
 class SearchMethod(object):
     """
@@ -228,6 +268,30 @@ class SearchMethod(object):
                     )
                     return None
 
+    @classmethod
+    async def search_v2(cls, text, trace_id):
+        """
+        dy ---> baidu ---> xigua
+        :param trace_id:
+        :param text:
+        :return:
+        """
+        L = []
+        print(trace_id)
+        douyin_result = douyin_search(keyword=text, sensitive_words=cls.s_words)
+        for vid_obj in douyin_result:
+            L.append({"platform": "dy_search", "result": vid_obj})
+        if len(L) >= 3:
+            return L
+        else:
+            baidu_result = hksp_search(key=text, sensitive_words=cls.s_words)
+            if baidu_result:
+                L.append({"platform": "baidu_search", "result": baidu_result[0]})
+            xigua_result = xigua_search_v2(keyword=text, sensitive_words=cls.s_words)
+            if xigua_result:
+                L.append({"platform": "xg_search", "result": xigua_result[0]})
+            return L
+
 
 async def video_sender(video_obj, user, trace_id, platform):
     """
@@ -271,9 +335,9 @@ async def video_sender(video_obj, user, trace_id, platform):
     video_id = await AE.etl_deal()
     logging(
         code="6002",
-        info="视频下载完成",
+        info="视频下载完成, 平台是---{}".format(platform),
         data=mq_obj,
-        trace_id=trace_id
+        trace_id=trace_id,
     )
     return video_id
 
@@ -352,6 +416,31 @@ async def search_videos(params, trace_id, gh_id, mysql_client):
         )
 
 
+async def insert_into_mysql(index, mysql_client, recall_video, gh_id, trace_id, platform):
+    """
+    Send one recalled video through video_sender and store the returned id.
+    The id is written to column recall_video_id{index} of table
+    long_articles_video, on the row whose trace_id matches.
+    :param platform: platform name forwarded to video_sender
+    :param trace_id: trace id used in the WHERE clause and passed to video_sender
+    :param gh_id: key looked up in gh_id_dict to obtain the user for video_sender
+    :param index: suffix selecting which recall_video_id column is updated
+    :param mysql_client: client whose async_insert executes the UPDATE
+    :param recall_video: video object forwarded to video_sender
+    """
+    video_id = await video_sender(
+        video_obj=recall_video,
+        user=gh_id_dict.get(gh_id),
+        trace_id=trace_id,
+        platform=platform,
+    )
+    # NOTE(review): index, video_id and trace_id are interpolated directly
+    # into the SQL text; confirm they are trusted and that video_id is never None.
+    update_id_sql = f"""
+        UPDATE long_articles_video
+        SET
+        recall_video_id{index} = {video_id}
+        WHERE
+        trace_id = '{trace_id}'
+    """
+    await mysql_client.async_insert(update_id_sql)
+
+
 async def re_search_videos(params, trace_id, gh_id, mysql_client):
     """
     重新搜索接口
@@ -360,63 +449,34 @@ async def re_search_videos(params, trace_id, gh_id, mysql_client):
     :param gh_id:
     :param mysql_client:
     :return:
-    cls.ori_title = info["ori_title"]
-    cls.article_summary = info["content_title"]
-    cls.article_keys = info["content_keys"]
-        cls.trace_id = info["trace_id"]
     """
     obj = {
-        "ori_title": params['ori_title'],
+        "ori_title": params['title'],
         "content_title": params['kimi_summary'],
         "content_keys": params['kimi_keys'],
         "trace_id": params['trace_id']
     }
     SearchAB = SearchABTest(info=obj, gh_id=gh_id)
-    recall_obj_1 = await SearchAB.ab_1()
-    # recall_obj_1 = await SearchAB.ab_0()
-    await asyncio.sleep(3)
-    recall_obj_2 = await SearchAB.ab_2()
-    await asyncio.sleep(3)
-    recall_obj_3 = await SearchAB.ab_3()
-    print("{}---视频搜索正常".format(trace_id))
-    recall_list = [recall_obj_1, recall_obj_2, recall_obj_3]
-    un_empty_list = [i for i in recall_list if i]
-    if len(un_empty_list) < 3:
-        await asyncio.sleep(3)
-        recall_obj_4 = await SearchAB.ab_4()
-        if recall_obj_4:
-            un_empty_list.append(recall_obj_4)
-
-    # 逐条下载,逐条写表
-    if un_empty_list:
-        for index, recall_obj in enumerate(un_empty_list, 1):
-            platform = recall_obj["platform"]
-            recall_video = recall_obj["result"]
+    # 启三个搜索,每个搜索都保证要搜索到, 分别用key1, key2, key3去搜索
+    recall_list = await SearchAB.ab_5()
+    print("一共搜索到{}条视频".format(len(recall_list)))
+    index = 0
+    for recall_obj in recall_list:
+        if recall_obj:
+            platform = recall_obj['platform']
+            recall_video = recall_obj['result']
             if recall_video:
-                logging(
-                    code="7002",
-                    info="视频搜索成功, 搜索平台为--{}".format(platform),
-                    trace_id=trace_id,
-                    data=recall_video,
-                )
-                video_id = await video_sender(
-                    video_obj=recall_video,
-                    user=gh_id_dict.get(gh_id),
+                index += 1
+                await insert_into_mysql(
+                    index=index,
+                    mysql_client=mysql_client,
+                    recall_video=recall_video,
+                    gh_id=gh_id,
                     trace_id=trace_id,
-                    platform=platform,
+                    platform=platform
                 )
-                update_id_sql = f"""
-                        UPDATE long_articles_video
-                        SET
-                        recall_video_id{index} = {video_id}
-                        WHERE
-                        trace_id = '{trace_id}'
-                        """
-                await mysql_client.async_insert(update_id_sql)
-    else:
-        logging(
-            code="7003",
-            info="视频搜索失败, 被敏感词过滤",
-            trace_id=trace_id
-        )
+                if index >= 3:
+                    print("already downloaded 3 videos")
+                    break
 
+    print("一个匹配到{}条文章".format(index))

+ 3 - 4
applications/search/dy_search.py

@@ -38,14 +38,13 @@ def douyin_search(keyword, sensitive_words):
                 if sensitive_flag(sensitive_words, title):
                     res = douyin_detail(video_id)
                     L.append(res)
-                    return L
                 else:
                     continue
             except Exception as e:
                 continue
-        return []
-    except:
-        print("search_fail---{}".format(keyword))
+        return L
+    except Exception as e:
+        print("search_fail---{}, error---{}".format(keyword, e))
         return []