"""
Kuaishou search crawler.

@Author: luojunhui
"""
import os
import sys
import json
import time
import uuid
import random
import datetime

import requests
from lxml import etree

# Make the project root importable when the script is launched from it.
sys.path.append(os.getcwd())

from application.items import VideoItem
from application.pipeline import PiaoQuanPipeline
from application.common.messageQueue import MQ
from application.common.proxies import tunnel_proxies
from application.common.log import AliyunLogger


class KuaiShouSearch(object):
    """
    Keyword search crawler for Kuaishou (www.kuaishou.com).

    Queries the site's GraphQL endpoint for videos matching a keyword and
    converts each returned feed entry into a ``VideoItem`` payload.
    """

    def __init__(self, platform, mode, rule_dict, user_list, env="prod"):
        """
        :param platform: platform name; also used as the trace/session prefix
        :param mode: crawl strategy name, stored on every produced item
        :param rule_dict: filtering rules (stored, not read by this class)
        :param user_list: list of dicts with ``uid`` and ``nick_name`` keys;
            one is picked at random for every processed video
        :param env: environment suffix of the MQ topic name
        """
        self.platform = platform
        self.mode = mode
        self.rule_dict = rule_dict
        self.user_list = user_list
        self.env = env
        self.download_cnt = 0
        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
        self.expire_flag = False
        self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)

    def search_videos(self, keyword, timeout=10):
        """
        Search videos by keyword.

        Only the first result page is requested (``pcursor`` is empty); the
        original author notes that one search yields about 20 videos.

        :param keyword: search keyword
        :param timeout: seconds to wait for the HTTP request before giving up
        :return: list of feed dicts; empty when the response carries no feeds
        :raises requests.RequestException: on network errors, timeouts,
            non-2xx HTTP status codes or a non-JSON response body
        """
        url = 'https://www.kuaishou.com/graphql'
        # NOTE: the cookie (including the ``did`` device id) is hard-coded.
        headers = {
            'Accept-Language': 'zh,en;q=0.9,zh-CN;q=0.8',
            'Connection': 'keep-alive',
            'Cookie': 'kpf=PC_WEB; clientid=3; did=web_5db53a9e49dca57728b58cecb7863868; didv=1698736264000; kpn=KUAISHOU_VISION',
            'Origin': 'https://www.kuaishou.com',
            'Referer': 'https://www.kuaishou.com/search/video?searchKey=%E8%80%81%E5%B9%B4%E5%A4%A7%E5%AD%A6',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
            'accept': '*/*',
            'content-type': 'application/json',
            'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
        }
        data = {
            "operationName": "visionSearchPhoto",
            "variables": {
                "keyword": keyword,
                "pcursor": "",
                "page": "search"
            },
            "query": """
            fragment photoContent on PhotoEntity {
              __typename
              id
              duration
              caption
              originCaption
              likeCount
              viewCount
              commentCount
              realLikeCount
              coverUrl
              photoUrl
              photoH265Url
              manifest
              manifestH265
              videoResource
              coverUrls {
                url
                __typename
              }
              timestamp
              expTag
              animatedCoverUrl
              distance
              videoRatio
              liked
              stereoType
              profileUserTopPhoto
              musicBlocked
              riskTagContent
              riskTagUrl
            }

            fragment recoPhotoFragment on recoPhotoEntity {
              __typename
              id
              duration
              caption
              originCaption
              likeCount
              viewCount
              commentCount
              realLikeCount
              coverUrl
              photoUrl
              photoH265Url
              manifest
              manifestH265
              videoResource
              coverUrls {
                url
                __typename
              }
              timestamp
              expTag
              animatedCoverUrl
              distance
              videoRatio
              liked
              stereoType
              profileUserTopPhoto
              musicBlocked
              riskTagContent
              riskTagUrl
            }

            fragment feedContent on Feed {
              type
              author {
                id
                name
                headerUrl
                following
                headerUrls {
                  url
                  __typename
                }
                __typename
              }
              photo {
                ...photoContent
                ...recoPhotoFragment
                __typename
              }
              canAddComment
              llsid
              status
              currentPcursor
              tags {
                type
                name
                __typename
              }
              __typename
            }

            query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {
              visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {
                result
                llsid
                webPageArea
                feeds {
                  ...feedContent
                  __typename
                }
                searchSessionId
                pcursor
                aladdinBanner {
                  imgUrl
                  link
                  __typename
                }
                __typename
              }
            }
            """
        }
        # A timeout keeps a stalled connection from blocking the crawler forever.
        response = requests.post(url, headers=headers, json=data, timeout=timeout)
        response.raise_for_status()
        payload = response.json()
        # "data" / "visionSearchPhoto" / "feeds" may each be missing or null in
        # the response; treat every such case as "no results" rather than
        # failing with TypeError / KeyError.
        search_result = (payload.get("data") or {}).get("visionSearchPhoto") or {}
        return search_result.get("feeds") or []

    def process_video_obj(self, video_obj):
        """
        Convert one feed entry into a ``VideoItem`` payload and print it.

        :param video_obj: a feed dict as returned by ``search_videos``; its
            ``photo`` entry must provide ``timestamp``, ``caption``,
            ``coverUrl``, ``realLikeCount``, ``viewCount`` and a ``manifest``
            with ``videoId`` and at least one adaptation set / representation
        :return: the object produced by ``VideoItem.produce_item()``
        """
        # NOTE(review): trace_id is generated but not attached to the item.
        trace_id = self.platform + str(uuid.uuid1())
        our_user = random.choice(self.user_list)
        photo = video_obj["photo"]
        manifest = photo["manifest"]
        # The API timestamp is divided by 1000 (presumably milliseconds ->
        # seconds); integer division avoids a float round-trip.
        publish_time_stamp = int(photo["timestamp"]) // 1000

        item = VideoItem()
        item.add_video_info("user_id", our_user["uid"])
        item.add_video_info("user_name", our_user["nick_name"])
        item.add_video_info("video_id", manifest["videoId"])
        item.add_video_info("video_title", photo["caption"])
        # NOTE: "publish_time_str" was disabled (commented out) in the original
        # code; only the epoch-seconds stamp is emitted.
        item.add_video_info("publish_time_stamp", publish_time_stamp)
        # Use the first representation of the first adaptation set as the URL.
        item.add_video_info(
            "video_url",
            manifest["adaptationSet"][0]["representation"][0]["url"],
        )
        item.add_video_info("cover_url", photo["coverUrl"])
        item.add_video_info("like_cnt", photo["realLikeCount"])
        item.add_video_info("play_cnt", photo["viewCount"])
        item.add_video_info("out_video_id", manifest["videoId"])
        item.add_video_info("platform", self.platform)
        item.add_video_info("strategy", self.mode)
        item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
        mq_obj = item.produce_item()
        print(json.dumps(mq_obj, ensure_ascii=False, indent=4))
        return mq_obj


if __name__ == '__main__':
    KS = KuaiShouSearch(platform="kuaishou", mode="search", rule_dict={}, user_list=[{"uid": 1, "nick_name": "ljh"}])
    video_list = KS.search_videos("王者荣耀")
    for video_obj in video_list:
        KS.process_video_obj(video_obj)