
2024-06-06
Deployed a new ETL step to fix cover download failures

Removed the automatic whitelisting feature

罗俊辉 10 months ago
commit 750222f692

+ 10 - 4
applications/functions/async_etl.py

@@ -1,7 +1,7 @@
 """
 @author: luojunhui
 """
-
+import json
 import os
 
 import oss2
@@ -219,9 +219,15 @@ class AsyncETL(object):
         oss_video = await upload_to_oss(
             local_video_path=file_path,
         )
-        oss_cover = await upload_to_oss(
-            local_video_path=cover_path
-        )
+        # Read the cover; if the downloaded file is actually HTML, do not upload it
+        with open(cover_path, encoding="utf-8") as f:
+            img_data = f.read()
+        if "<html>" in img_data:
+            oss_cover = None
+        else:
+            oss_cover = await upload_to_oss(
+                local_video_path=cover_path
+            )
         # publish to pq
         result = await self.publish_by__request(
             video_path=oss_video,
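
A side note on this check: real JPEG/PNG covers are usually not valid UTF-8, so opening the cover with encoding="utf-8" can raise UnicodeDecodeError before the "<html>" test ever runs. A minimal sketch of the same idea done in binary mode; the helper name is hypothetical and not part of this commit:

def cover_is_html(cover_path):
    """Return True when the downloaded 'cover' file is actually an HTML error page."""
    # read only a small binary prefix; the HTML marker, if present, appears near the start
    with open(cover_path, "rb") as f:
        head = f.read(512)
    # decode defensively: binary image data may not be valid UTF-8
    text = head.decode("utf-8", errors="ignore").lower()
    return "<html" in text or "<!doctype html" in text

With a helper like this, the branch above reduces to: upload the cover only when cover_is_html(cover_path) is False, otherwise set oss_cover to None.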

+ 1 - 1
applications/functions/common.py

@@ -79,7 +79,7 @@ def create_gzh_path(video_id, shared_uid):
     source_id = "longArticles_" + generate_source_id()
     url = f"pages/user-videos?id={video_id}&su={shared_uid}&fromGzh=1&rootShareId={root_share_id}&shareId={root_share_id}&rootSourceId={source_id}"
     # Automatically add root_share_id to the whitelist
-    auto_white(root_share_id)
+    # auto_white(root_share_id)
     return root_share_id, source_id, f"pages/category?jumpPage={urllib.parse.quote(url, safe='')}"
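
For context on the surrounding function, the whole inner URL is wrapped into a single jumpPage query value. A minimal standalone sketch of that encoding round trip, using only urllib.parse and placeholder ids (none of these values come from the real service):

import urllib.parse

root_share_id = "demo_root_share_id"      # placeholder values, not real ids
source_id = "longArticles_demo_source"
video_id, shared_uid = 123456, "demo_uid"

url = (
    f"pages/user-videos?id={video_id}&su={shared_uid}&fromGzh=1"
    f"&rootShareId={root_share_id}&shareId={root_share_id}&rootSourceId={source_id}"
)
# safe='' percent-encodes '/', '?', '&' and '=' too, so the inner URL
# survives as one opaque query value instead of being split by the outer URL
jump_page = f"pages/category?jumpPage={urllib.parse.quote(url, safe='')}"

# the receiving page can recover the original inner URL with unquote
assert urllib.parse.unquote(jump_page.split("jumpPage=")[1]) == url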
 
 

+ 70 - 0
applications/schedule/search_schedule.py

@@ -350,3 +350,73 @@ async def search_videos(params, trace_id, gh_id, mysql_client):
             info="视频搜索失败, 被敏感词过滤",
             trace_id=trace_id
         )
+
+
+async def re_search_videos(params, trace_id, gh_id, mysql_client):
+    """
+    重新搜索接口
+    :param params:
+    :param trace_id:
+    :param gh_id:
+    :param mysql_client:
+    :return:
+    cls.ori_title = info["ori_title"]
+    cls.article_summary = info["content_title"]
+    cls.article_keys = info["content_keys"]
+        cls.trace_id = info["trace_id"]
+    """
+    obj = {
+        "ori_title": params['ori_title'],
+        "content_title": params['kimi_summary'],
+        "content_keys": params['kimi_keys'],
+        "trace_id": params['trace_id']
+    }
+    SearchAB = SearchABTest(info=obj, gh_id=gh_id)
+    recall_obj_1 = await SearchAB.ab_1()
+    # recall_obj_1 = await SearchAB.ab_0()
+    await asyncio.sleep(3)
+    recall_obj_2 = await SearchAB.ab_2()
+    await asyncio.sleep(3)
+    recall_obj_3 = await SearchAB.ab_3()
+    print("{}---视频搜索正常".format(trace_id))
+    recall_list = [recall_obj_1, recall_obj_2, recall_obj_3]
+    un_empty_list = [i for i in recall_list if i]
+    if len(un_empty_list) < 3:
+        await asyncio.sleep(3)
+        recall_obj_4 = await SearchAB.ab_4()
+        if recall_obj_4:
+            un_empty_list.append(recall_obj_4)
+
+    # Download each recalled video and write it to the table one by one
+    if un_empty_list:
+        for index, recall_obj in enumerate(un_empty_list, 1):
+            platform = recall_obj["platform"]
+            recall_video = recall_obj["result"]
+            if recall_video:
+                logging(
+                    code="7002",
+                    info="视频搜索成功, 搜索平台为--{}".format(platform),
+                    trace_id=trace_id,
+                    data=recall_video,
+                )
+                video_id = await video_sender(
+                    video_obj=recall_video,
+                    user=gh_id_dict.get(gh_id),
+                    trace_id=trace_id,
+                    platform=platform,
+                )
+                update_id_sql = f"""
+                        UPDATE long_articles_video
+                        SET
+                        recall_video_id{index} = {video_id}
+                        WHERE
+                        trace_id = '{trace_id}'
+                        """
+                await mysql_client.async_insert(update_id_sql)
+    else:
+        logging(
+            code="7003",
+            info="视频搜索失败, 被敏感词过滤",
+            trace_id=trace_id
+        )
+
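
A possible follow-up for the UPDATE above: video_id and trace_id are interpolated straight into the SQL text. A minimal parameterized sketch, assuming conn is an aiomysql-style connection object; the project's mysql_client may expose a different API, so the function below is only an illustration, not the committed code:

async def update_recall_video_id(conn, index, video_id, trace_id):
    """Write one recalled video id back to long_articles_video (illustrative only)."""
    # the column name cannot be a bound parameter, so whitelist the index explicitly
    if index not in (1, 2, 3, 4):
        raise ValueError("unexpected recall index: {}".format(index))
    sql = f"UPDATE long_articles_video SET recall_video_id{index} = %s WHERE trace_id = %s"
    async with conn.cursor() as cur:
        # video_id and trace_id travel as bound parameters, not string fragments
        await cur.execute(sql, (video_id, trace_id))
    await conn.commit()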

+ 7 - 7
applications/search/hksp_search.py

@@ -91,14 +91,14 @@ def hksp_search(key, sensitive_words):
         'x-requested-with': 'xmlhttprequest',
     }
     # Send the GET request
-    response = requests.get(
-        url,
-        headers=headers,
-        params=params,
-        proxies=tunnel_proxies(),
-        timeout=120
-    ).json()
     try:
+        response = requests.get(
+            url,
+            headers=headers,
+            params=params,
+            proxies=tunnel_proxies(),
+            timeout=120
+        ).json()
         data_list = response['data']['list']
         L = []
         for data in data_list:
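
Moving the request inside the try block means connection errors, timeouts, non-JSON responses and missing keys are all caught by the same handler. A minimal standalone sketch of that pattern with plain requests; the function name and the bare print are placeholders, not the project's logging:

import requests

def fetch_search_results(url, params, headers, proxies=None):
    """Return the search result list, or [] when the request, JSON decode, or lookup fails."""
    try:
        # request, JSON decoding, and key lookup are handled in one place
        response = requests.get(
            url,
            headers=headers,
            params=params,
            proxies=proxies,
            timeout=120,
        ).json()
        return response["data"]["list"]
    except Exception as e:
        print("search request failed: {}".format(e))
        return []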