|
@@ -0,0 +1,711 @@
|
|
|
+"""
|
|
|
+西瓜视频搜索爬虫
|
|
|
+"""
|
|
|
+import os
|
|
|
+import sys
|
|
|
+import json
|
|
|
+import time
|
|
|
+import uuid
|
|
|
+import random
|
|
|
+import base64
|
|
|
+import asyncio
|
|
|
+import aiohttp
|
|
|
+import urllib.parse
|
|
|
+
|
|
|
+import requests
|
|
|
+from lxml import etree
|
|
|
+
|
|
|
+sys.path.append(os.getcwd())
|
|
|
+
|
|
|
+from application.items import VideoItem
|
|
|
+from application.pipeline import PiaoQuanPipeline
|
|
|
+from application.common.messageQueue import MQ
|
|
|
+from application.common.proxies import tunnel_proxies
|
|
|
+from application.common.log import AliyunLogger
|
|
|
+
|
|
|
+
|
|
|
# Resource containers under "videoResource", in the order they are preferred.
_RESOURCE_PRIORITY = ("dash_120fps", "dash", "normal")
# Fixed entries of a resource's "video_list", in the order they are preferred.
_QUALITY_PRIORITY = ("video_4", "video_3", "video_2", "video_1")


def _decode_base64_url(encoded):
    """
    Decode a base64-encoded URL string into text.

    Missing trailing ``=`` padding is restored so the length is a multiple of
    four characters (the unit base64 decodes in) before decoding.

    :param encoded: base64 text, with or without its padding
    :return: the decoded UTF-8 string
    """
    padded = encoded + "=" * (-len(encoded) % 4)
    return base64.b64decode(padded).decode("utf8")


def get_video_url(video_info):
    """
    Extract the decoded video/audio URLs and the frame size of a video.

    Only the first of ``dash_120fps``, ``dash`` and ``normal`` found under
    ``video_info["videoResource"]`` is inspected; the others are not used as
    a fallback even when the chosen one yields nothing. Inside that resource
    the ``video_list`` entries ``video_4`` .. ``video_1`` are tried in that
    order, and the matching entry supplies both the video and the audio URL.
    When no such entry exists, the last items of
    ``dynamic_video["dynamic_video_list"]`` and
    ``dynamic_video["dynamic_audio_list"]`` are used instead (both lists must
    be non-empty).

    :param video_info: dict describing one video
    :return: dict with ``video_url``, ``audio_url``, ``video_width`` and
        ``video_height``; the URLs are ``""`` and the sizes ``0`` when no
        stream could be selected
    :raises KeyError: if the selected stream entry lacks ``backup_url_1``,
        ``vwidth`` or ``vheight``
    """
    empty = {"video_url": "", "audio_url": "", "video_width": 0, "video_height": 0}
    if "videoResource" not in video_info:
        return empty

    resources = video_info["videoResource"]
    resource = next(
        (resources[name] for name in _RESOURCE_PRIORITY if name in resources),
        None,
    )
    if not resource:
        return empty

    video_list = resource.get("video_list") or {}
    for quality in _QUALITY_PRIORITY:
        if quality in video_list:
            # A fixed-quality entry provides both URLs from the same record.
            video_entry = audio_entry = video_list[quality]
            break
    else:
        dynamic = resource.get("dynamic_video") or {}
        video_streams = dynamic.get("dynamic_video_list") or []
        audio_streams = dynamic.get("dynamic_audio_list") or []
        if not (video_streams and audio_streams):
            return empty
        video_entry, audio_entry = video_streams[-1], audio_streams[-1]

    # Each URL is padded and decoded independently of the other.
    return {
        "video_url": _decode_base64_url(video_entry["backup_url_1"]),
        "audio_url": _decode_base64_url(audio_entry["backup_url_1"]),
        "video_width": video_entry["vwidth"],
        "video_height": video_entry["vheight"],
    }
|
|
|
+
|
|
|
+
|
|
|
class XiGuaSearch(object):
    """
    Keyword search crawler for ixigua.com.

    ``search`` loads the search result page for a keyword, collects the result
    links and fetches the details of every result concurrently through
    ``get_video_info``.
    """

    def __init__(self, platform, mode, rule_dict, user_list, env="prod"):
        """
        :param platform: platform identifier; passed to the logger and written
            to every item as ``platform``
        :param mode: crawl mode; passed to the logger and written to every
            item as ``strategy``
        :param rule_dict: stored on the instance (not read by the methods of
            this class)
        :param user_list: stored on the instance (not read by the methods of
            this class)
        :param env: environment name appended to the MQ topic name
        """
        self.platform = platform
        self.mode = mode
        self.rule_dict = rule_dict
        self.user_list = user_list
        self.env = env
        self.download_cnt = 0
        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
        self.expire_flag = False
        self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)

    async def search(self, keyword):
        """
        Search for ``keyword`` and process every video on the result page.

        :param keyword: search term; it is URL-quoted before being placed in
            the request path
        :return: None
        """
        keyword = urllib.parse.quote(keyword)
        base_url = "https://www.ixigua.com/search/{}/ab_name=search&fss=input".format(
            keyword
        )
        # NOTE(review): the cookie below is a hard-coded browser session and
        # will presumably expire - confirm how it is meant to be refreshed.
        headers = {
            "authority": "www.ixigua.com",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "zh,en;q=0.9,zh-CN;q=0.8",
            "cache-control": "max-age=0",
            "cookie": "ixigua-a-s=1; support_webp=true; support_avif=true; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=Ur23fgYD2pMJOvi1BpILyfaobg8wA7IhGwmQx260ULRa8Dvjaxc5ZA63BUIP-6Vi473f; ttwid=1%7CNtTtSp4Iej-v0nWtepdZH3d3Ts6uGNMFzTN20ps1cdo%7C1708236945%7Cc1f301c64aa3bf69cdaa41f28856e2bb7b7eed16583f8c92d50cffa2d9944fc6; msToken=rr418opQf04vm8n9s8FAGdr1AoCUsvAOGKSDPbBEfwVS1sznxxZCvcZTI93qVz5uAXlX9yRwcKlNQZ4wMro2DmlHw5yWHAVeKr_SzgO1KtVVnjUMTUNEux_cq1-EIkI=",
            "sec-ch-ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"macOS"',
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        }
        # requests is a blocking client: run it in the default executor so the
        # event loop stays responsive, and bound the wait with a timeout so a
        # stalled connection cannot hang the crawler forever.
        loop = asyncio.get_event_loop()
        basic_response = await loop.run_in_executor(
            None,
            lambda: requests.get(url=base_url, headers=headers, timeout=30),
        )
        html = etree.HTML(basic_response.text)
        result = html.xpath(
            '//a[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]/@href'
        )
        print(result)
        async with aiohttp.ClientSession() as session:
            # NOTE(review): [1:-2] drops the first character and the last two
            # characters of each href; presumably that leaves the bare video
            # id - confirm against the live page markup.
            tasks = [self.get_video_info(session, page_id[1:-2]) for page_id in result]
            await asyncio.gather(*tasks)

    async def get_video_info(self, session, page_id):
        """
        Fetch the details of one video, build a ``VideoItem`` and print it.

        :param session: aiohttp client session used for the request
        :param page_id: id of the video's home page
        :return: None
        """
        url = "https://www.ixigua.com/api/mixVideo/information?"
        headers = {
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh-Hans;q=0.9",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
            "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
        }
        params = {
            "mixId": str(page_id),
            "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC"
            "NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
            "X-Bogus": "DFSzswVupYTANCJOSBk0P53WxM-r",
            "_signature": "_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px"
            "fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94",
        }
        cookies = {
            "ixigua-a-s": "1",
            "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB"
            "NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
            "ttwid": "1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7"
            "6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8",
            "tt_scid": "QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3",
            "MONITOR_WEB_ID": "0a49204a-7af5-4e96-95f0-f4bafb7450ad",
            "__ac_nonce": "06304878000964fdad287",
            "__ac_signature": "_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb"
            "FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8",
            "ttcid": "e56fabf6e85d4adf9e4d91902496a0e882",
            "_tea_utm_cache_1300": "undefined",
            "support_avif": "false",
            "support_webp": "false",
            "xiguavideopcwebid": "7134967546256016900",
            "xiguavideopcwebid.sig": "xxRww5R1VEMJN_dQepHorEu_eAc",
        }
        async with session.get(
            url, headers=headers, params=params, cookies=cookies
        ) as response:
            payload = await response.json()

        # A payload without "data" is treated like any other missing level:
        # the item is then built from default values instead of raising.
        video_info = (
            (payload.get("data") or {})
            .get("gidInformation", {})
            .get("packerData", {})
            .get("video", {})
        )
        video_id = video_info.get("videoResource", {}).get("vid", "")
        user_info = video_info.get("user_info", {})
        publish_time = int(video_info.get("video_publish_time", 0))
        # Resolve the stream URLs once and reuse the result for both fields.
        url_info = get_video_url(video_info)

        item = VideoItem()
        item.add_video_info("video_title", video_info.get("title", ""))
        item.add_video_info("video_id", video_id)
        item.add_video_info("play_cnt", int(video_info.get("video_watch_count", 0)))
        item.add_video_info("like_cnt", int(video_info.get("video_like_count", 0)))
        item.add_video_info("duration", int(video_info.get("video_duration", 0)))
        item.add_video_info("publish_time_stamp", publish_time)
        item.add_video_info(
            "publish_time_str",
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time)),
        )
        item.add_video_info("user_name", user_info.get("name", ""))
        item.add_video_info("user_id", str(user_info.get("user_id", "")))
        item.add_video_info("avatar_url", str(user_info.get("avatar_url", "")))
        item.add_video_info("cover_url", video_info.get("poster_url", ""))
        item.add_video_info("audio_url", url_info["audio_url"])
        item.add_video_info("video_url", url_info["video_url"])
        item.add_video_info("session", "xigua-search-{}".format(int(time.time())))
        item.add_video_info("out_video_id", video_id)
        item.add_video_info("platform", self.platform)
        item.add_video_info("strategy", self.mode)
        mq_obj = item.produce_item()
        print(json.dumps(mq_obj, ensure_ascii=False, indent=4))
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Manual smoke run: search one fixed keyword using placeholder settings.
    searcher = XiGuaSearch(platform=1, mode=2, rule_dict=3, user_list=1)
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(searcher.search("春节"))
|