Forráskód Böngészése

2024-06-11
优化重搜索逻辑
视频生成文章接口返回三条视频

罗俊辉 1 éve
szülő
commit
ebc75f003b

+ 0 - 1
applications/functions/async_etl.py

@@ -1,7 +1,6 @@
 """
 """
 @author: luojunhui
 @author: luojunhui
 """
 """
-import json
 import os
 import os
 
 
 import oss2
 import oss2

+ 2 - 1
applications/functions/async_mysql.py

@@ -23,7 +23,8 @@ class AsyncMySQLClient(object):
             user='crawler',
             user='crawler',
             password='crawler123456@',
             password='crawler123456@',
             db='piaoquan-crawler',
             db='piaoquan-crawler',
-            charset='utf8mb4'
+            charset='utf8mb4',
+            connect_timeout=120,
         )
         )
         print("mysql init successfully")
         print("mysql init successfully")
 
 

+ 3 - 4
applications/routes.py

@@ -1,14 +1,13 @@
 """
 """
 @author: luojunhui
 @author: luojunhui
 """
 """
-import json
 import time
 import time
 import uuid
 import uuid
 import asyncio
 import asyncio
 from quart import Blueprint, jsonify, request
 from quart import Blueprint, jsonify, request
 
 
 from applications.functions.log import logging
 from applications.functions.log import logging
-from applications.schedule import recall_videos, search_videos, return_info_v2
+from applications.schedule import recall_videos, search_videos, re_search_videos
 
 
 my_blueprint = Blueprint('LongArticles', __name__)
 my_blueprint = Blueprint('LongArticles', __name__)
 
 
@@ -132,7 +131,7 @@ def Routes(mysql_client):
         return jsonify(result)
         return jsonify(result)
 
 
     @my_blueprint.route('/re_search_videos', methods=['POST'])
     @my_blueprint.route('/re_search_videos', methods=['POST'])
-    async def re_search_videos():
+    async def ree_search_videos():
         """
         """
         重新搜索
         重新搜索
         :return:
         :return:
@@ -149,7 +148,7 @@ def Routes(mysql_client):
         )
         )
         try:
         try:
             asyncio.ensure_future(
             asyncio.ensure_future(
-                search_videos(
+                re_search_videos(
                     params=params,
                     params=params,
                     trace_id=trace_id,
                     trace_id=trace_id,
                     gh_id=gh_id,
                     gh_id=gh_id,

+ 1 - 1
applications/schedule/__init__.py

@@ -2,5 +2,5 @@
 @author: luojunhui
 @author: luojunhui
 """
 """
 from .process_schedule import recall_videos
 from .process_schedule import recall_videos
-from .search_schedule import search_videos
+from .search_schedule import search_videos, re_search_videos
 from .process_schedule import return_info_v2
 from .process_schedule import return_info_v2

+ 6 - 1
applications/schedule/process_schedule.py

@@ -160,13 +160,18 @@ async def recall_videos(trace_id, mysql_client):
                     mysql_client=mysql_client,
                     mysql_client=mysql_client,
                     index=index
                     index=index
                 )
                 )
+                L.append(temp)
             else:
             else:
                 temp = await return_info_v2(
                 temp = await return_info_v2(
                     video_id=best_video_id,
                     video_id=best_video_id,
                     trace_id=trace_id,
                     trace_id=trace_id,
                     mysql_client=mysql_client
                     mysql_client=mysql_client
                 )
                 )
-            L.append(temp)
+                L.append(temp)
+                temp['paragraphPosition'] = 0.5
+                L.append(temp)
+                temp['paragraphPosition'] = 0.75
+                L.append(temp)
         result = {
         result = {
             "traceId": trace_id,
             "traceId": trace_id,
             "miniprogramList": L
             "miniprogramList": L

+ 111 - 51
applications/schedule/search_schedule.py

@@ -141,6 +141,46 @@ class SearchABTest(object):
         else:
         else:
             return await cls.base_line()
             return await cls.base_line()
 
 
+    @classmethod
+    async def ab_5(cls):
+        """
+        增量搜索, 返回result_list
+        :return:
+        """
+        result_list = await SearchMethod().search_v2(
+            text=cls.article_summary[:15],
+            trace_id=cls.trace_id
+        )
+        if len(result_list) > 3:
+            return result_list
+        else:
+            result_list += await SearchMethod().search_v2(
+                text=cls.ori_title[:15],
+                trace_id=cls.trace_id
+            )
+            if len(result_list) > 3:
+                return result_list
+            else:
+                result_list += await SearchMethod().search_v2(
+                    text=cls.article_keys[0],
+                    trace_id=cls.trace_id
+                )
+                if len(result_list) > 3:
+                    return result_list
+                else:
+                    result_list += await SearchMethod().search_v2(
+                        text=cls.article_keys[1],
+                        trace_id=cls.trace_id
+                    )
+                    if result_list:
+                        return result_list
+                    else:
+                        result_list += await SearchMethod().search_v2(
+                            text=cls.article_keys[2],
+                            trace_id=cls.trace_id
+                        )
+                        return result_list
+
 
 
 class SearchMethod(object):
 class SearchMethod(object):
     """
     """
@@ -228,6 +268,30 @@ class SearchMethod(object):
                     )
                     )
                     return None
                     return None
 
 
+    @classmethod
+    async def search_v2(cls, text, trace_id):
+        """
+        dy ---> baidu ---> xigua
+        :param trace_id:
+        :param text:
+        :return:
+        """
+        L = []
+        print(trace_id)
+        douyin_result = douyin_search(keyword=text, sensitive_words=cls.s_words)
+        for vid_obj in douyin_result:
+            L.append({"platform": "dy_search", "result": vid_obj})
+        if len(L) >= 3:
+            return L
+        else:
+            baidu_result = hksp_search(key=text, sensitive_words=cls.s_words)
+            if baidu_result:
+                L.append({"platform": "baidu_search", "result": baidu_result[0]})
+            xigua_result = xigua_search_v2(keyword=text, sensitive_words=cls.s_words)
+            if xigua_result:
+                L.append({"platform": "xg_search", "result": xigua_result[0]})
+            return L
+
 
 
 async def video_sender(video_obj, user, trace_id, platform):
 async def video_sender(video_obj, user, trace_id, platform):
     """
     """
@@ -271,9 +335,9 @@ async def video_sender(video_obj, user, trace_id, platform):
     video_id = await AE.etl_deal()
     video_id = await AE.etl_deal()
     logging(
     logging(
         code="6002",
         code="6002",
-        info="视频下载完成",
+        info="视频下载完成, 平台是---{}".format(platform),
         data=mq_obj,
         data=mq_obj,
-        trace_id=trace_id
+        trace_id=trace_id,
     )
     )
     return video_id
     return video_id
 
 
@@ -352,6 +416,31 @@ async def search_videos(params, trace_id, gh_id, mysql_client):
         )
         )
 
 
 
 
+async def insert_into_mysql(index, mysql_client, recall_video, gh_id, trace_id, platform):
+    """
+    :param platform:
+    :param trace_id:
+    :param gh_id:
+    :param index:
+    :param mysql_client:
+    :param recall_video:
+    """
+    video_id = await video_sender(
+        video_obj=recall_video,
+        user=gh_id_dict.get(gh_id),
+        trace_id=trace_id,
+        platform=platform,
+    )
+    update_id_sql = f"""
+        UPDATE long_articles_video
+        SET
+        recall_video_id{index} = {video_id}
+        WHERE
+        trace_id = '{trace_id}'
+    """
+    await mysql_client.async_insert(update_id_sql)
+
+
 async def re_search_videos(params, trace_id, gh_id, mysql_client):
 async def re_search_videos(params, trace_id, gh_id, mysql_client):
     """
     """
     重新搜索接口
     重新搜索接口
@@ -360,63 +449,34 @@ async def re_search_videos(params, trace_id, gh_id, mysql_client):
     :param gh_id:
     :param gh_id:
     :param mysql_client:
     :param mysql_client:
     :return:
     :return:
-    cls.ori_title = info["ori_title"]
-    cls.article_summary = info["content_title"]
-    cls.article_keys = info["content_keys"]
-        cls.trace_id = info["trace_id"]
     """
     """
     obj = {
     obj = {
-        "ori_title": params['ori_title'],
+        "ori_title": params['title'],
         "content_title": params['kimi_summary'],
         "content_title": params['kimi_summary'],
         "content_keys": params['kimi_keys'],
         "content_keys": params['kimi_keys'],
         "trace_id": params['trace_id']
         "trace_id": params['trace_id']
     }
     }
     SearchAB = SearchABTest(info=obj, gh_id=gh_id)
     SearchAB = SearchABTest(info=obj, gh_id=gh_id)
-    recall_obj_1 = await SearchAB.ab_1()
-    # recall_obj_1 = await SearchAB.ab_0()
-    await asyncio.sleep(3)
-    recall_obj_2 = await SearchAB.ab_2()
-    await asyncio.sleep(3)
-    recall_obj_3 = await SearchAB.ab_3()
-    print("{}---视频搜索正常".format(trace_id))
-    recall_list = [recall_obj_1, recall_obj_2, recall_obj_3]
-    un_empty_list = [i for i in recall_list if i]
-    if len(un_empty_list) < 3:
-        await asyncio.sleep(3)
-        recall_obj_4 = await SearchAB.ab_4()
-        if recall_obj_4:
-            un_empty_list.append(recall_obj_4)
-
-    # 逐条下载,逐条写表
-    if un_empty_list:
-        for index, recall_obj in enumerate(un_empty_list, 1):
-            platform = recall_obj["platform"]
-            recall_video = recall_obj["result"]
+    # 启三个搜索,每个搜索都保证要搜索到, 分别用key1, key2, key3去搜索
+    recall_list = await SearchAB.ab_5()
+    print("一共搜索到{}条视频".format(len(recall_list)))
+    index = 0
+    for recall_obj in recall_list:
+        if recall_obj:
+            platform = recall_obj['platform']
+            recall_video = recall_obj['result']
             if recall_video:
             if recall_video:
-                logging(
-                    code="7002",
-                    info="视频搜索成功, 搜索平台为--{}".format(platform),
-                    trace_id=trace_id,
-                    data=recall_video,
-                )
-                video_id = await video_sender(
-                    video_obj=recall_video,
-                    user=gh_id_dict.get(gh_id),
+                index += 1
+                await insert_into_mysql(
+                    index=index,
+                    mysql_client=mysql_client,
+                    recall_video=recall_video,
+                    gh_id=gh_id,
                     trace_id=trace_id,
                     trace_id=trace_id,
-                    platform=platform,
+                    platform=platform
                 )
                 )
-                update_id_sql = f"""
-                        UPDATE long_articles_video
-                        SET
-                        recall_video_id{index} = {video_id}
-                        WHERE
-                        trace_id = '{trace_id}'
-                        """
-                await mysql_client.async_insert(update_id_sql)
-    else:
-        logging(
-            code="7003",
-            info="视频搜索失败, 被敏感词过滤",
-            trace_id=trace_id
-        )
+                if index >= 3:
+                    print("already downloaded 3 videos")
+                    break
 
 
+    print("一个匹配到{}条文章".format(index))

+ 3 - 4
applications/search/dy_search.py

@@ -38,14 +38,13 @@ def douyin_search(keyword, sensitive_words):
                 if sensitive_flag(sensitive_words, title):
                 if sensitive_flag(sensitive_words, title):
                     res = douyin_detail(video_id)
                     res = douyin_detail(video_id)
                     L.append(res)
                     L.append(res)
-                    return L
                 else:
                 else:
                     continue
                     continue
             except Exception as e:
             except Exception as e:
                 continue
                 continue
-        return []
-    except:
-        print("search_fail---{}".format(keyword))
+        return L
+    except Exception as e:
+        print("search_fail---{}, error---{}".format(keyword, e))
         return []
         return []