"""
Kuaishou search crawler.

@Author: luojunhui
"""
import os
import sys
import json
import time
import uuid
import random
import datetime

import requests
from lxml import etree

# Make the current working directory importable so the ``application``
# package below resolves when this file is run as a script.
sys.path.append(os.getcwd())

from application.items import VideoItem
from application.pipeline import PiaoQuanPipeline
from application.common.messageQueue import MQ
from application.common.proxies import tunnel_proxies
from application.common.log import AliyunLogger


class KuaiShouSearch(object):
    """
    Kuaishou keyword-search crawler.

    Sends a ``visionSearchPhoto`` GraphQL request to the Kuaishou web
    endpoint and converts each returned feed entry into a ``VideoItem``.
    """

    def __init__(self, platform, mode, rule_dict, user_list, env="prod"):
        """
        Store the crawler configuration and create the MQ / logger helpers.

        :param platform: platform name; used for the logger, the item's
            ``platform`` field and the ``session`` prefix
        :param mode: crawl mode; used for the logger and the item's
            ``strategy`` field
        :param rule_dict: rule configuration (stored only; not read by the
            methods of this class)
        :param user_list: non-empty list of dicts with ``"uid"`` and
            ``"nick_name"`` keys; one entry is picked at random per video
        :param env: environment suffix appended to the MQ topic name
        """
        self.platform = platform
        self.mode = mode
        self.rule_dict = rule_dict
        self.user_list = user_list
        self.env = env
        self.download_cnt = 0
        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
        self.expire_flag = False
        self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)

    def search_videos(self, keyword, timeout=30):
        """
        Search Kuaishou for ``keyword`` and return the raw feed entries.

        Only the first page is requested (``pcursor`` is empty); per the
        original author's note a single search yields 20 videos.

        :param keyword: search keyword
        :param timeout: seconds to wait for the HTTP request before
            ``requests`` raises a timeout error; without it a stalled
            connection would block forever
        :return: the ``feeds`` value of the ``visionSearchPhoto`` response
        :raises requests.exceptions.RequestException: on network failure or
            timeout
        :raises KeyError, TypeError: if the response JSON does not have the
            ``data -> visionSearchPhoto -> feeds`` structure
        """
        url = 'https://www.kuaishou.com/graphql'
        headers = {
            'Accept-Language': 'zh,en;q=0.9,zh-CN;q=0.8',
            'Connection': 'keep-alive',
            'Cookie': 'kpf=PC_WEB; clientid=3; did=web_5db53a9e49dca57728b58cecb7863868; didv=1698736264000; kpn=KUAISHOU_VISION',
            'Origin': 'https://www.kuaishou.com',
            'Referer': 'https://www.kuaishou.com/search/video?searchKey=%E8%80%81%E5%B9%B4%E5%A4%A7%E5%AD%A6',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
            'accept': '*/*',
            'content-type': 'application/json',
            'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
        }
        data = {
            "operationName": "visionSearchPhoto",
            "variables": {
                "keyword": keyword,
                "pcursor": "",
                "page": "search"
            },
            # Whitespace inside a GraphQL document is insignificant, so the
            # query is laid out for readability.
            "query": """
            fragment photoContent on PhotoEntity {
              __typename
              id
              duration
              caption
              originCaption
              likeCount
              viewCount
              commentCount
              realLikeCount
              coverUrl
              photoUrl
              photoH265Url
              manifest
              manifestH265
              videoResource
              coverUrls {
                url
                __typename
              }
              timestamp
              expTag
              animatedCoverUrl
              distance
              videoRatio
              liked
              stereoType
              profileUserTopPhoto
              musicBlocked
              riskTagContent
              riskTagUrl
            }

            fragment recoPhotoFragment on recoPhotoEntity {
              __typename
              id
              duration
              caption
              originCaption
              likeCount
              viewCount
              commentCount
              realLikeCount
              coverUrl
              photoUrl
              photoH265Url
              manifest
              manifestH265
              videoResource
              coverUrls {
                url
                __typename
              }
              timestamp
              expTag
              animatedCoverUrl
              distance
              videoRatio
              liked
              stereoType
              profileUserTopPhoto
              musicBlocked
              riskTagContent
              riskTagUrl
            }

            fragment feedContent on Feed {
              type
              author {
                id
                name
                headerUrl
                following
                headerUrls {
                  url
                  __typename
                }
                __typename
              }
              photo {
                ...photoContent
                ...recoPhotoFragment
                __typename
              }
              canAddComment
              llsid
              status
              currentPcursor
              tags {
                type
                name
                __typename
              }
              __typename
            }

            query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {
              visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {
                result
                llsid
                webPageArea
                feeds {
                  ...feedContent
                  __typename
                }
                searchSessionId
                pcursor
                aladdinBanner {
                  imgUrl
                  link
                  __typename
                }
                __typename
              }
            }
            """
        }
        # An explicit timeout keeps a hung connection from blocking the
        # crawler indefinitely (requests has no default timeout).
        response = requests.post(url, headers=headers, json=data, timeout=timeout)
        payload = response.json()
        video_list = payload['data']['visionSearchPhoto']['feeds']
        return video_list

    def process_video_obj(self, video_obj):
        """
        Build a ``VideoItem`` from one feed entry and print the produced item.

        :param video_obj: one element of the list returned by
            ``search_videos``; must contain a ``"photo"`` dict with
            ``timestamp``, ``manifest``, ``caption``, ``coverUrl``,
            ``realLikeCount`` and ``viewCount``
        :return: None (the produced item is printed as JSON)
        :raises KeyError, IndexError, TypeError: if ``video_obj`` lacks any
            of the fields read below
        """
        our_user = random.choice(self.user_list)
        photo = video_obj["photo"]
        # "timestamp" is divided by 1000 before being stored as the publish
        # time stamp (presumably milliseconds -> seconds; confirm against
        # the API).
        publish_time_stamp = int(photo["timestamp"] / 1000)
        manifest = photo["manifest"]

        item = VideoItem()
        item.add_video_info("user_id", our_user["uid"])
        item.add_video_info("user_name", our_user["nick_name"])
        item.add_video_info("video_id", manifest["videoId"])
        item.add_video_info("video_title", photo['caption'])
        # NOTE: "publish_time_str" is not populated by this method.
        item.add_video_info("publish_time_stamp", publish_time_stamp)
        # Use the first representation of the first adaptation set as the
        # video URL.
        item.add_video_info(
            "video_url",
            manifest['adaptationSet'][0]['representation'][0]['url']
        )
        item.add_video_info("cover_url", photo["coverUrl"])
        item.add_video_info("like_cnt", photo["realLikeCount"])
        item.add_video_info("play_cnt", photo["viewCount"])
        item.add_video_info("out_video_id", manifest["videoId"])
        item.add_video_info("platform", self.platform)
        item.add_video_info("strategy", self.mode)
        item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
        mq_obj = item.produce_item()
        print(json.dumps(mq_obj, ensure_ascii=False, indent=4))


if __name__ == '__main__':
    # Manual smoke run: search one keyword and print every converted item.
    KS = KuaiShouSearch(platform="kuaishou", mode="search", rule_dict={}, user_list=[{"uid": 1, "nick_name": "ljh"}])
    video_list = KS.search_videos("王者荣耀")
    for i in video_list:
        KS.process_video_obj(i)