|
@@ -0,0 +1,228 @@
|
|
|
+"""
|
|
|
+快手搜索爬虫
|
|
|
+@Author: luojunhui
|
|
|
+"""
|
|
|
+import os
|
|
|
+import sys
|
|
|
+import json
|
|
|
+import time
|
|
|
+import uuid
|
|
|
+import random
|
|
|
+import datetime
|
|
|
+
|
|
|
+import requests
|
|
|
+from lxml import etree
|
|
|
+
|
|
|
+sys.path.append(os.getcwd())
|
|
|
+
|
|
|
+from application.items import VideoItem
|
|
|
+from application.pipeline import PiaoQuanPipeline
|
|
|
+from application.common.messageQueue import MQ
|
|
|
+from application.common.proxies import tunnel_proxies
|
|
|
+from application.common.log import AliyunLogger
|
|
|
+
|
|
|
+
|
|
|
class KuaiShouSearch(object):
    """
    Kuaishou keyword-search crawler.

    Posts a ``visionSearchPhoto`` GraphQL query to the Kuaishou web endpoint
    and turns each returned feed entry into a ``VideoItem``.
    """

    # GraphQL endpoint the search request is posted to.
    _SEARCH_URL = 'https://www.kuaishou.com/graphql'

    # Seconds to wait for the search request before giving up. Without a
    # timeout ``requests`` waits indefinitely on a stalled connection.
    _DEFAULT_TIMEOUT = 10

    # Browser-like headers sent with every search request.
    # NOTE(review): the Cookie (device id) and the Referer (a fixed
    # ``searchKey``, independent of the keyword actually searched) are
    # hard-coded; confirm the endpoint keeps accepting them.
    _SEARCH_HEADERS = {
        'Accept-Language': 'zh,en;q=0.9,zh-CN;q=0.8',
        'Connection': 'keep-alive',
        'Cookie': 'kpf=PC_WEB; clientid=3; did=web_5db53a9e49dca57728b58cecb7863868; didv=1698736264000; kpn=KUAISHOU_VISION',
        'Origin': 'https://www.kuaishou.com',
        'Referer': 'https://www.kuaishou.com/search/video?searchKey=%E8%80%81%E5%B9%B4%E5%A4%A7%E5%AD%A6',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'accept': '*/*',
        'content-type': 'application/json',
        'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
    }

    # GraphQL document for the ``visionSearchPhoto`` operation.
    _SEARCH_QUERY = """
        fragment photoContent on PhotoEntity {
            __typename
            id
            duration
            caption
            originCaption
            likeCount
            viewCount
            commentCount
            realLikeCount
            coverUrl
            photoUrl
            photoH265Url
            manifest
            manifestH265
            videoResource
            coverUrls {
                url
                __typename
            }
            timestamp
            expTag
            animatedCoverUrl
            distance
            videoRatio
            liked
            stereoType
            profileUserTopPhoto
            musicBlocked
            riskTagContent
            riskTagUrl
        }

        fragment recoPhotoFragment on recoPhotoEntity {
            __typename
            id
            duration
            caption
            originCaption
            likeCount
            viewCount
            commentCount
            realLikeCount
            coverUrl
            photoUrl
            photoH265Url
            manifest
            manifestH265
            videoResource
            coverUrls {
                url
                __typename
            }
            timestamp
            expTag
            animatedCoverUrl
            distance
            videoRatio
            liked
            stereoType
            profileUserTopPhoto
            musicBlocked
            riskTagContent
            riskTagUrl
        }

        fragment feedContent on Feed {
            type
            author {
                id
                name
                headerUrl
                following
                headerUrls {
                    url
                    __typename
                }
                __typename
            }
            photo {
                ...photoContent
                ...recoPhotoFragment
                __typename
            }
            canAddComment
            llsid
            status
            currentPcursor
            tags {
                type
                name
                __typename
            }
            __typename
        }

        query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {
            visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {
                result
                llsid
                webPageArea
                feeds {
                    ...feedContent
                    __typename
                }
                searchSessionId
                pcursor
                aladdinBanner {
                    imgUrl
                    link
                    __typename
                }
                __typename
            }
        }
    """

    def __init__(self, platform, mode, rule_dict, user_list, env="prod"):
        """
        Store the crawl configuration and create the MQ / logging clients.

        :param platform: platform name; written into each item and passed to
            the logger
        :param mode: crawl strategy name; written into each item as
            ``strategy`` and passed to the logger
        :param rule_dict: crawl rules; only stored here
        :param user_list: candidate owner accounts; ``process_video_obj``
            reads the ``uid`` and ``nick_name`` keys of a random entry
        :param env: environment suffix appended to the MQ topic name
        """
        self.platform = platform
        self.mode = mode
        self.rule_dict = rule_dict
        self.user_list = user_list
        self.env = env
        # Not read anywhere in this class yet; presumably bookkeeping for a
        # later download / expiry step -- TODO confirm.
        self.download_cnt = 0
        self.expire_flag = False
        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
        self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)

    @staticmethod
    def _build_search_payload(keyword):
        """Return the JSON body of a first-page search for ``keyword``."""
        return {
            "operationName": "visionSearchPhoto",
            "variables": {
                "keyword": keyword,
                # Empty cursor: only the first page of results is requested.
                "pcursor": "",
                "page": "search",
            },
            "query": KuaiShouSearch._SEARCH_QUERY,
        }

    @staticmethod
    def _extract_feeds(payload):
        """
        Return ``payload['data']['visionSearchPhoto']['feeds']``, or an empty
        list when any level of that path is missing, null or not a list.
        """
        node = payload
        for key in ("data", "visionSearchPhoto", "feeds"):
            if not isinstance(node, dict):
                return []
            node = node.get(key)
        return node if isinstance(node, list) else []

    def search_videos(self, keyword, timeout=_DEFAULT_TIMEOUT):
        """
        Search Kuaishou for ``keyword`` and return the first page of feeds.

        The original author's note says one search yields only 20 videos;
        the request itself just asks for the first page (empty ``pcursor``).

        :param keyword: search keyword
        :param timeout: seconds to wait for the HTTP request, passed through
            to ``requests.post``
        :return: list of feed dicts; empty when the response carries no feeds
        :raises requests.RequestException: on connection failure, timeout or
            a 4xx/5xx status
        :raises ValueError: if the response body is not valid JSON
        """
        response = requests.post(
            self._SEARCH_URL,
            headers=dict(self._SEARCH_HEADERS),
            json=self._build_search_payload(keyword),
            timeout=timeout,
        )
        # Fail loudly on HTTP errors instead of trying to parse an error page.
        response.raise_for_status()
        return self._extract_feeds(response.json())

    def process_video_obj(self, video_obj):
        """
        Build a ``VideoItem`` from one search feed entry and print it.

        :param video_obj: one element of the list returned by
            ``search_videos``; its ``photo`` dict must provide ``timestamp``,
            ``manifest``, ``caption``, ``coverUrl``, ``realLikeCount`` and
            ``viewCount``
        :return: None
        """
        photo = video_obj["photo"]
        manifest = photo["manifest"]
        video_id = manifest["videoId"]
        our_user = random.choice(self.user_list)
        # The timestamp is divided by 1000 before being stored (presumably
        # milliseconds -> seconds; confirm against the API).
        publish_time_stamp = int(photo["timestamp"] / 1000)

        item = VideoItem()
        item.add_video_info("user_id", our_user["uid"])
        item.add_video_info("user_name", our_user["nick_name"])
        item.add_video_info("video_id", video_id)
        item.add_video_info("video_title", photo["caption"])
        item.add_video_info("publish_time_stamp", publish_time_stamp)
        # First representation of the first adaptation set is used as the
        # playable URL.
        item.add_video_info(
            "video_url",
            manifest["adaptationSet"][0]["representation"][0]["url"],
        )
        item.add_video_info("cover_url", photo["coverUrl"])
        item.add_video_info("like_cnt", photo["realLikeCount"])
        item.add_video_info("play_cnt", photo["viewCount"])
        item.add_video_info("out_video_id", video_id)
        item.add_video_info("platform", self.platform)
        item.add_video_info("strategy", self.mode)
        item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
        mq_obj = item.produce_item()
        print(json.dumps(mq_obj, ensure_ascii=False, indent=4))
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Manual smoke run: search one keyword and print every converted item.
    demo_users = [{"uid": 1, "nick_name": "ljh"}]
    crawler = KuaiShouSearch(
        platform="kuaishou",
        mode="search",
        rule_dict={},
        user_list=demo_users,
    )
    for feed in crawler.search_videos("王者荣耀"):
        crawler.process_video_obj(feed)
|