Parcourir la source

2024-0514-搜索策略实验上线

罗俊辉 il y a 1 an
Parent
commit
37d603a213
3 fichiers modifiés avec 49 ajouts et 21 suppressions
  1. 27 0
      applications/schedule/search_schedule.py
  2. 21 20
      applications/search/xigua_search.py
  3. 1 1
      dev/test.py

+ 27 - 0
applications/schedule/search_schedule.py

@@ -63,6 +63,15 @@ class SearchABTest(object):
                     info="通过baidu搜索失败---{}".format(cls.ori_title),
                     trace_id=cls.trace_id,
                 )
+                xigua_result = xigua_search(keyword=cls.ori_title)
+                if xigua_result:
+                    return {"platform": "xg_search", "result": xigua_result[0]}
+                else:
+                    logging(
+                        code="7001",
+                        info="通过西瓜搜索失败---{}".format(cls.ori_title),
+                        trace_id=cls.trace_id,
+                    )
                 return None
 
     @classmethod
@@ -90,6 +99,15 @@ class SearchABTest(object):
                     info="通过baidu搜索失败---{}".format(cls.article_summary),
                     trace_id=cls.trace_id,
                 )
+                xigua_result = xigua_search(keyword=cls.article_summary)
+                if xigua_result:
+                    return {"platform": "xg_search", "result": xigua_result[0]}
+                else:
+                    logging(
+                        code="7001",
+                        info="通过西瓜搜索失败---{}".format(cls.article_summary),
+                        trace_id=cls.trace_id,
+                    )
                 return None
 
     @classmethod
@@ -118,6 +136,15 @@ class SearchABTest(object):
                     info="通过baidu搜索失败---{}".format(",".join(cls.article_keys)),
                     trace_id=cls.trace_id,
                 )
+                xigua_result = xigua_search(keyword=",".join(cls.article_keys))
+                if xigua_result:
+                    return {"platform": "xg_search", "result": xigua_result[0]}
+                else:
+                    logging(
+                        code="7001",
+                        info="通过西瓜搜索失败---{}".format(",".join(cls.article_keys)),
+                        trace_id=cls.trace_id,
+                    )
                 return None
 
     @classmethod

+ 21 - 20
applications/search/xigua_search.py

@@ -5,6 +5,7 @@
 import re
 import json
 import base64
+
 import requests
 import urllib.parse
 
@@ -201,25 +202,25 @@ def xigua_search(keyword):
     }
     basic_response = requests.get(url=base_url, headers=headers)
     html = etree.HTML(basic_response.text)
-    result = html.xpath(
-        '//a[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]/@href'
+    result_list = html.xpath(
+        '//div[@class="HorizontalFeedCard searchPageV2__card"]/div[1]/a'
     )
-    if result:
-        L = []
-        doc_id_list = [page_id[1:] for page_id in result]
-        for doc_id in doc_id_list:
-            try:
-                video_d = XiGuaFunctions().get_video_info(doc_id)
-                video_title = video_d['video_title']
-                if sensitive_flag(sensitive_words, video_title) and int(video_d['duration']) <= 300:
-                    print("西瓜视频信息")
-                    print(video_d)
-                    L.append(video_d)
-                else:
-                    continue
-            except Exception as e:
-                print(e)
-                continue
-        return L
-    else:
+    if result_list:
+        for item in result_list:
+            url = item.xpath("@href")[0]
+            duration_str = str(item.xpath("./span/text()")[0])
+            duration = int(duration_str.split(":")[0]) * 60 + int(duration_str.split(":")[1])
+            title = item.xpath("@title")[0]
+            real_title = bytes(str(title), "latin1").decode()
+            if sensitive_flag(sensitive_words, real_title) and duration <= 300:
+                try:
+                    res = XiGuaFunctions().get_video_info(url[1:])
+                    if res:
+                        return [res]
+                    else:
+                        continue
+                except Exception as e:
+                    print(e)
         return []
+    else:
+        return []

+ 1 - 1
dev/test.py

@@ -11,7 +11,7 @@ body = {
     "cover": "http://mmbiz.qpic.cn/mmbiz_jpg/DAIqn771G5OegJt9iacR7k7Gxgs0ic6Vg0fUtxmicJzxoBWicibQciaQkhgXzsyJqnfETRibGHMwZzsCkSuVXb16xdMTg/0?from=appmsg",
     # "ghId": "gh_b15de7c99912", # ab2
     "ghId": "gh_a2901d34f75b", # ab1
-    "title": "加沙地带多家医院周围地区遭空袭!巴以冲突最新消息",
+    "title": "加@@沙地带多家医院周围地区遭空袭!巴以冲突最新消息",
     # "traceId": "search-4a246d27-740d-4581-bbd9-48eb48dfcee3-1715685085",
     # "kimi_title": "🔴紧急!加沙地带医院遭空袭,巴以冲突最新动态!🔥",
     # "videoList": [20738863]