123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228 |
- """
- 快手搜索爬虫
- @Author: luojunhui
- """
- import os
- import sys
- import json
- import time
- import uuid
- import random
- import datetime
- import requests
- from lxml import etree
- sys.path.append(os.getcwd())
- from application.items import VideoItem
- from application.pipeline import PiaoQuanPipeline
- from application.common.messageQueue import MQ
- from application.common.proxies import tunnel_proxies
- from application.common.log import AliyunLogger
class KuaiShouSearch(object):
    """
    Kuaishou keyword-search crawler.

    Sends a ``visionSearchPhoto`` GraphQL request to the Kuaishou web endpoint
    and turns each returned feed entry into a ``VideoItem``.
    """

    # Seconds to wait for the search endpoint before giving up; without a
    # timeout ``requests`` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    # GraphQL document sent with every search request. Whitespace is not
    # significant in GraphQL, so the layout here is purely for readability.
    _SEARCH_QUERY = """
        fragment photoContent on PhotoEntity {
            __typename
            id
            duration
            caption
            originCaption
            likeCount
            viewCount
            commentCount
            realLikeCount
            coverUrl
            photoUrl
            photoH265Url
            manifest
            manifestH265
            videoResource
            coverUrls {
                url
                __typename
            }
            timestamp
            expTag
            animatedCoverUrl
            distance
            videoRatio
            liked
            stereoType
            profileUserTopPhoto
            musicBlocked
            riskTagContent
            riskTagUrl
        }
        fragment recoPhotoFragment on recoPhotoEntity {
            __typename
            id
            duration
            caption
            originCaption
            likeCount
            viewCount
            commentCount
            realLikeCount
            coverUrl
            photoUrl
            photoH265Url
            manifest
            manifestH265
            videoResource
            coverUrls {
                url
                __typename
            }
            timestamp
            expTag
            animatedCoverUrl
            distance
            videoRatio
            liked
            stereoType
            profileUserTopPhoto
            musicBlocked
            riskTagContent
            riskTagUrl
        }
        fragment feedContent on Feed {
            type
            author {
                id
                name
                headerUrl
                following
                headerUrls {
                    url
                    __typename
                }
                __typename
            }
            photo {
                ...photoContent
                ...recoPhotoFragment
                __typename
            }
            canAddComment
            llsid
            status
            currentPcursor
            tags {
                type
                name
                __typename
            }
            __typename
        }
        query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {
            visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {
                result
                llsid
                webPageArea
                feeds {
                    ...feedContent
                    __typename
                }
                searchSessionId
                pcursor
                aladdinBanner {
                    imgUrl
                    link
                    __typename
                }
                __typename
            }
        }
    """

    def __init__(self, platform, mode, rule_dict, user_list, env="prod"):
        """
        Store the crawl configuration and create the MQ client and logger.

        :param platform: platform name; written into every produced item and
            passed to the logger
        :param mode: crawl strategy name; written into every produced item and
            passed to the logger
        :param rule_dict: crawl rules; stored but not read by the methods of
            this class
        :param user_list: list of dicts with ``uid`` and ``nick_name`` keys;
            one entry is picked at random for each processed video
        :param env: environment suffix appended to the MQ topic name
        """
        self.platform = platform
        self.mode = mode
        self.rule_dict = rule_dict
        self.user_list = user_list
        self.env = env
        self.download_cnt = 0
        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
        self.expire_flag = False
        self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)

    @staticmethod
    def _extract_feeds(payload):
        """
        Pull the feed list out of a decoded GraphQL search response.

        :param payload: decoded JSON body of the search response
        :return: the list under ``data.visionSearchPhoto.feeds``, or an empty
            list when any level of that path is missing, null, or not of the
            expected type
        """
        node = payload
        for key in ('data', 'visionSearchPhoto', 'feeds'):
            if not isinstance(node, dict):
                return []
            node = node.get(key)
        return node if isinstance(node, list) else []

    def search_videos(self, keyword):
        """
        Search Kuaishou for a keyword and return the first page of feeds.

        Only a single request with an empty ``pcursor`` is sent, so only the
        first page of results is returned.

        :param keyword: search keyword
        :return: list of feed dicts; empty when the response carries no feeds
        """
        url = 'https://www.kuaishou.com/graphql'
        headers = {
            'Accept-Language': 'zh,en;q=0.9,zh-CN;q=0.8',
            'Connection': 'keep-alive',
            'Cookie': 'kpf=PC_WEB; clientid=3; did=web_5db53a9e49dca57728b58cecb7863868; didv=1698736264000; kpn=KUAISHOU_VISION',
            'Origin': 'https://www.kuaishou.com',
            'Referer': 'https://www.kuaishou.com/search/video?searchKey=%E8%80%81%E5%B9%B4%E5%A4%A7%E5%AD%A6',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
            'accept': '*/*',
            'content-type': 'application/json',
            'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
        }
        data = {
            "operationName": "visionSearchPhoto",
            "variables": {
                "keyword": keyword,
                "pcursor": "",
                "page": "search",
            },
            "query": self._SEARCH_QUERY,
        }
        response = requests.post(
            url, headers=headers, json=data, timeout=self.REQUEST_TIMEOUT
        ).json()
        # The endpoint can answer with null/missing sections; treat that as
        # "no results" instead of failing on a subscript of None.
        return self._extract_feeds(response)

    def process_video_obj(self, video_obj):
        """
        Build a ``VideoItem`` from one search feed entry and print it as JSON.

        :param video_obj: one feed dict as returned by ``search_videos``; its
            ``photo`` entry must contain ``timestamp``, ``caption``,
            ``coverUrl``, ``realLikeCount``, ``viewCount`` and a ``manifest``
            with ``videoId`` and at least one adaptation set / representation
        :return: None
        """
        photo = video_obj["photo"]
        manifest = photo["manifest"]
        our_user = random.choice(self.user_list)
        # The timestamp is divided by 1000 — presumably milliseconds to
        # seconds; confirm against the API.
        publish_time_stamp = int(photo["timestamp"] / 1000)

        item = VideoItem()
        item.add_video_info("user_id", our_user["uid"])
        item.add_video_info("user_name", our_user["nick_name"])
        item.add_video_info("video_id", manifest["videoId"])
        item.add_video_info("video_title", photo['caption'])
        item.add_video_info("publish_time_stamp", publish_time_stamp)
        # Use the first representation of the first adaptation set as the
        # playable URL.
        item.add_video_info(
            "video_url", manifest['adaptationSet'][0]['representation'][0]['url']
        )
        item.add_video_info("cover_url", photo["coverUrl"])
        item.add_video_info("like_cnt", photo["realLikeCount"])
        item.add_video_info("play_cnt", photo["viewCount"])
        item.add_video_info("out_video_id", manifest["videoId"])
        item.add_video_info("platform", self.platform)
        item.add_video_info("strategy", self.mode)
        item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
        mq_obj = item.produce_item()
        print(json.dumps(mq_obj, ensure_ascii=False, indent=4))
if __name__ == '__main__':
    # Manual run: search one keyword and print every item produced from the
    # returned feeds.
    crawler = KuaiShouSearch(
        platform="kuaishou",
        mode="search",
        rule_dict={},
        user_list=[{"uid": 1, "nick_name": "ljh"}],
    )
    for feed in crawler.search_videos("王者荣耀"):
        crawler.process_video_obj(feed)
|