Selaa lähdekoodia

2024-06-17
标题该用原标题

罗俊辉 10 kuukautta sitten
vanhempi
commit
8e92df7bbd
2 muutettua tiedostoa, joissa 26 lisäystä ja 63 poistoa
  1. 7 2
      applications/functions/video_item.py
  2. 19 61
      applications/search/xigua_search.py

+ 7 - 2
applications/functions/video_item.py

@@ -1,6 +1,7 @@
 """
 @author: luojunhui
 """
+import json
 import time
 
 from applications.functions.common import clean_title
@@ -152,10 +153,12 @@ class VideoProducer(object):
         platform = "baidu_search"
         publish_time_stamp = int(video_obj['publish_time'])
         item = VideoItem()
+        print("baidu")
+        print(json.dumps(video_obj, ensure_ascii=False, indent=4))
         item.add_video_info("user_id", user["uid"])
         item.add_video_info("user_name", user["nick_name"])
         item.add_video_info("video_id", video_obj['id'])
-        item.add_video_info("video_title", video_obj.get('title', trace_id))
+        item.add_video_info("video_title", video_obj['title'])
         item.add_video_info("publish_time_stamp", publish_time_stamp)
         item.add_video_info("video_url", video_obj["playurl"])
         item.add_video_info("cover_url", video_obj["poster"])
@@ -211,10 +214,12 @@ class VideoProducer(object):
         platform = "dy_search"
         publish_time_stamp = int(video_obj['publish_timestamp'] / 1000)
         item = VideoItem()
+        print("douyin")
+        print(json.dumps(video_obj, ensure_ascii=False, indent=4))
         item.add_video_info("user_id", user["uid"])
         item.add_video_info("user_name", user["nick_name"])
         item.add_video_info("video_id", video_obj['channel_content_id'])
-        item.add_video_info("video_title", video_obj.get('title', trace_id))
+        item.add_video_info("video_title", video_obj['title'])
         item.add_video_info("publish_time_stamp", int(publish_time_stamp))
         item.add_video_info("video_url", video_obj["video_url_list"][0]['video_url'])
         item.add_video_info("cover_url", video_obj["image_url_list"][0]['image_url'])

+ 19 - 61
applications/search/xigua_search.py

@@ -156,70 +156,28 @@ class XiGuaFunctions(object):
         """
         url = "https://www.ixigua.com/{}".format(item_id)
         headers = {
-            "accept-encoding": "gzip, deflate",
+            # "accept-encoding": "gzip, deflate",
             "accept-language": "zh-CN,zh-Hans;q=0.9",
-            "cookie": "ttwid={}".format(cls.byte_dance_cookie(item_id)),
-            "user-agent": FakeUserAgent().random,
+            # "cookie": "ttwid={}".format(cls.byte_dance_cookie(item_id)),
+            "cookie": "UIFID=73355a799e41c2edb6d004baa6cda0116425031dff9117e11075ec8bf266082874fe897f43e66be83a0501afe4a08cfc7e1066ab88423af122641493c7af9f0a745eb85c50fddb096de5cc77cd5ff05503312d84d36ab2681c6e6d930bbe68edaebf8fae03b04eb669359965e01c266b;"
+                      "__ac_nonce=0666fd1a00053bf535b9f;"
+                      "__ac_signature=_02B4Z6wo00f01u8PTiQAAIDBvfBuP-YjUQbvL0qAAN25bWfWXQrzRNCBKvFYKS5wAOYPXg5XV1Ck9JEroeWeWKijH2v3i4lxXM37JogiJJfEtYD.8sbXul2-4v.VRRta4xa07ignRnGj5Voh83;"
+            # "msToken=Pc0sCOhbTxWnGbeqIHMcELMObmtTQGPwloqzOwtfsew-ao5WYnHuhKwE4TL_-88EGh64ec36ggsuqMuV-iBmcF1Gg92ZDGlD89lL6r0MMCg-8srTh1GfNgDnVfFq7g==; "
+            # "tt_scid=wLMuzIiixDpWtXV38R283kz.YIi2x1BE31RggCRLCsFJu204SFWS8Py13xxEPpzZ3b8e;"
+                      "ttwid=1%7C9b5sTIuwZxZKt0wFsvE-2t5OoFxH_Q5VIpVNWEREbAo%7C1718605316%7C9dfc9322350e713e6109ed46a7047ed31c0ab5a724e84de0bb766c195043207c",
+            "user-agent": FakeUserAgent().chrome,
             "referer": "https://www.ixigua.com/{}/".format(item_id),
         }
         response = requests.get(
             url=url,
             headers=headers
         )
+        print(response.text)
         video_info = cls.extract_info_by_re(response.text)
         return video_info
 
 
-def xigua_search(keyword, sensitive_words):
-    """
-    搜索
-    """
-    keyword = urllib.parse.quote(keyword)
-    base_url = "https://www.ixigua.com/search/{}/ab_name=search&fss=input".format(
-        keyword
-    )
-    headers = {
-        "authority": "www.ixigua.com",
-        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-        "accept-language": "zh,en;q=0.9,zh-CN;q=0.8",
-        "cache-control": "max-age=0",
-        "cookie": "ixigua-a-s=1; support_webp=true; support_avif=true; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=Ur23fgYD2pMJOvi1BpILyfaobg8wA7IhGwmQx260ULRa8Dvjaxc5ZA63BUIP-6Vi473f; ttwid=1%7CNtTtSp4Iej-v0nWtepdZH3d3Ts6uGNMFzTN20ps1cdo%7C1708236945%7Cc1f301c64aa3bf69cdaa41f28856e2bb7b7eed16583f8c92d50cffa2d9944fc6; msToken=rr418opQf04vm8n9s8FAGdr1AoCUsvAOGKSDPbBEfwVS1sznxxZCvcZTI93qVz5uAXlX9yRwcKlNQZ4wMro2DmlHw5yWHAVeKr_SzgO1KtVVnjUMTUNEux_cq1-EIkI=",
-        "upgrade-insecure-requests": "1",
-        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
-    }
-    basic_response = requests.get(url=base_url, headers=headers)
-    html = etree.HTML(basic_response.text)
-    result_list = html.xpath(
-        '//div[@class="HorizontalFeedCard searchPageV2__card"]/div[1]/a'
-    )
-    if result_list:
-        for item in result_list:
-            try:
-                url = item.xpath("@href")[0]
-                duration_str = str(item.xpath("./span/text()")[0])
-                duration_obj = duration_str.split(":")
-                if len(duration_obj) == 3:
-                    duration = 100000
-                elif len(duration_obj) == 2:
-                    duration = int(duration_str.split(":")[0]) * 60 + int(duration_str.split(":")[1])
-                else:
-                    duration = 10000
-                title = item.xpath("@title")[0]
-                real_title = bytes(str(title), "latin1").decode()
-                if sensitive_flag(sensitive_words, real_title) and duration <= 300:
-                    try:
-                        res = XiGuaFunctions().get_video_info(url[1:])
-                        if res:
-                            return [res]
-                        else:
-                            continue
-                    except Exception as e:
-                        print(e)
-            except Exception as e:
-                print(e)
-        return []
-    else:
-        return []
+# class XiGuaVideoDeal(object):
 
 
 def xigua_search_v2(keyword, sensitive_words):
@@ -250,14 +208,14 @@ def xigua_search_v2(keyword, sensitive_words):
                     duration = obj['data']['video_time']
                     watch_count = obj['data']['video_watch_count']
                     if sensitive_flag(sensitive_words, title) and duration <= 300:
-                        try:
-                            res = XiGuaFunctions().get_video_info(url)
-                            if res:
-                                return [res]
-                            else:
-                                continue
-                        except Exception as e:
-                            print(e)
+                        # try:
+                        res = XiGuaFunctions().get_video_info(url)
+                        if res:
+                            return [res]
+                        else:
+                            continue
+                        # except Exception as e:
+                        #     print(e)
             return []
         else:
             return []