# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/2/24
import os
import sys
import time
import json

import requests

sys.path.append(os.getcwd())
from common.common import Common
from common.feishu import Feishu
from common.users import Users


class Follow:
    """Crawler for Kuaishou (快手) user profiles and video lists via the web GraphQL API."""

    # Pagination cursor returned by visionProfilePhotoList; "" requests the first page.
    pcursor = ""
    platform = "快手"
    tag = "快手爬虫,定向爬虫策略"

    # Filter-word vocabulary
    @classmethod
    def filter_words(cls, log_type, crawler):
        """Fetch the filter-word list from the Feishu sheet 'HIKVvs'.

        Retries until the sheet is readable; returns a flat list of all
        non-empty cells, or None on unexpected errors (logged, not raised).
        """
        try:
            while True:
                filter_words_sheet = Feishu.get_values_batch(log_type, crawler, 'HIKVvs')
                if filter_words_sheet is None:
                    Common.logger(log_type, crawler).warning(f"filter_words_sheet:{filter_words_sheet} 10秒钟后重试")
                    # BUGFIX: the log line promises a 10-second retry, but the
                    # original looped immediately — a busy loop hammering Feishu.
                    time.sleep(10)
                    continue
                # Flatten the 2-D sheet, dropping empty cells.
                return [cell for row in filter_words_sheet for cell in row if cell is not None]
        except Exception as e:
            Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')

    @staticmethod
    def _parse_count(count_str):
        """Convert a Kuaishou count string such as '1,234' or '3.5万' to an int."""
        if "万" in count_str:
            return int(float(count_str.split("万")[0]) * 10000)
        return int(count_str.replace(",", ""))

    # Fetch info about an off-site (Kuaishou) user
    @classmethod
    def get_out_user_info(cls, log_type, crawler, out_uid):
        """Fetch fan/follow counts and avatar URL for a Kuaishou user.

        Returns a dict with keys out_fans / out_follow / out_avatar_url
        (all three always present), or None when the response is malformed
        (a warning is logged in that case).
        """
        try:
            url = "https://www.kuaishou.com/graphql"
            payload = json.dumps({
                "operationName": "visionProfile",
                "variables": {
                    "userId": out_uid
                },
                "query": "query visionProfile($userId: String) {\n  visionProfile(userId: $userId) {\n    result\n    hostName\n    userProfile {\n      ownerCount {\n        fan\n        photo\n        follow\n        photo_public\n        __typename\n      }\n      profile {\n        gender\n        user_name\n        user_id\n        headurl\n        user_text\n        user_profile_bg_url\n        __typename\n      }\n      isFollowing\n      __typename\n    }\n    __typename\n  }\n}\n"
            })
            headers = {
                'Cookie': 'kpf=PC_WEB; clientid=3; did=web_e2901e1c5a13c60af81ba88bc7a3ee24; userId=1921947321; kpn=KUAISHOU_VISION; kuaishou.server.web_st=ChZrdWFpc2hvdS5zZXJ2ZXIud2ViLnN0EqABE4wGjnJauApJelOpl9Xqo8TVDAyra7Pvo0rZtVgMSZxgVuw4Z6P2UtHv_CHOk2Ne2el1hdE_McCptWs8tRdtYlhXFlVOu8rQX7CwexzOBudJAfB3lDN8LPc4o4qHNwqFxy5J5j_WzdllbqMmaDUK9yUxX6XA-JFezzq9jvBwtGv7_hzB7pFrUcH39z0EYOQaZo5lDl-pE09Gw7wr8NvlZRoSdWlbobCW6oJxuQLJTUr9oj_uIiBhkeb1psaIIc3VwfYQ1UfvobrXAP_WpnRabE_3UZUBOygFMAE; kuaishou.server.web_ph=2b981e2051d7130c977fd31df97fe6f5ad54',
                'Referer': f'https://www.kuaishou.com/profile/{out_uid}',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41',
                'content-type': 'application/json',
            }
            response = requests.post(url=url, headers=headers, data=payload)
            if response.status_code != 200:
                Common.logger(log_type, crawler).warning(f"get_out_user_info_response:{response.text}\n")
                return
            # Parse the body once instead of re-parsing per check.
            body = response.json()
            if 'data' not in body:
                Common.logger(log_type, crawler).warning(f"get_out_user_info_response:{body}\n")
                return
            if 'visionProfile' not in body['data']:
                Common.logger(log_type, crawler).warning(f"get_out_user_info_response:{body['data']}\n")
                return
            if 'userProfile' not in body['data']['visionProfile']:
                # BUGFIX: the original logged ['visionProfile']['userProfile'] here,
                # which raised KeyError — 'userProfile' is exactly the missing key.
                Common.logger(log_type, crawler).warning(f"get_out_user_info_response:{body['data']['visionProfile']}\n")
                return
            userProfile = body['data']['visionProfile']['userProfile']
            out_user_dict = {}
            # BUGFIX: the original set out_fans TWICE (and never out_follow) when
            # 'ownerCount' was missing, and set only one of the two keys when
            # 'fan' or 'follow' was missing — callers reading
            # out_user_info["out_follow"] then hit a KeyError. Handle each
            # counter independently, defaulting to 0.
            owner_count = userProfile.get('ownerCount', {})
            if 'fan' in owner_count:
                out_user_dict['out_fans'] = cls._parse_count(str(owner_count['fan']))
            else:
                out_user_dict['out_fans'] = 0
            if 'follow' in owner_count:
                out_user_dict['out_follow'] = cls._parse_count(str(owner_count['follow']))
            else:
                out_user_dict['out_follow'] = 0
            profile = userProfile.get('profile', {})
            out_user_dict['out_avatar_url'] = profile['headurl'] if 'headurl' in profile else ''
            return out_user_dict
        except Exception as e:
            Common.logger(log_type, crawler).error(f"get_out_user_info:{e}\n")

    # Fetch user info (dict form). Note: some user_id cells are int, others str.
    @classmethod
    def get_user_list(cls, log_type, crawler, sheetid, env, machine):
        """Read the user sheet and return a list of per-user dicts.

        Rows that already have an our_uid are passed through; for the rest a
        site user is created via Users.create_user and the new uid/link are
        written back into sheet columns G:H.
        """
        try:
            while True:
                user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
                if user_sheet is None:
                    Common.logger(log_type, crawler).warning(f"user_sheet:{user_sheet} 10秒钟后重试")
                    # BUGFIX: honour the 10-second retry the log line promises.
                    time.sleep(10)
                    continue
                our_user_list = []
                # TODO(review): range(1, 3) looks like a debugging leftover —
                # the commented-out original iterated the whole sheet:
                # for i in range(1, len(user_sheet)):
                for i in range(1, 3):
                    out_uid = user_sheet[i][2]
                    user_name = user_sheet[i][3]
                    our_uid = user_sheet[i][6]
                    our_user_link = user_sheet[i][7]
                    if out_uid is None or user_name is None:
                        Common.logger(log_type, crawler).info("空行\n")
                        continue
                    Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
                    if our_uid is None:
                        out_user_info = cls.get_out_user_info(log_type, crawler, out_uid)
                        if out_user_info is None:
                            # BUGFIX: get_out_user_info returns None on a bad
                            # response; the original then crashed indexing None,
                            # aborting the whole sheet. Skip just this row.
                            Common.logger(log_type, crawler).warning(f"{user_name} 站外用户信息获取失败\n")
                            continue
                        out_user_dict = {
                            "out_uid": out_uid,
                            "user_name": user_name,
                            "out_avatar_url": out_user_info["out_avatar_url"],
                            "out_create_time": '',
                            "out_tag": '',
                            "out_play_cnt": 0,
                            "out_fans": out_user_info["out_fans"],
                            "out_follow": out_user_info["out_follow"],
                            "out_friend": 0,
                            "out_like": 0,
                            "platform": cls.platform,
                            "tag": cls.tag,
                        }
                        our_user_dict = Users.create_user(log_type=log_type, crawler=crawler,
                                                          out_user_dict=out_user_dict, env=env, machine=machine)
                        our_uid = our_user_dict['our_uid']
                        our_user_link = our_user_dict['our_user_link']
                        # Write the freshly created uid/link back into columns G:H.
                        Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}',
                                             [[our_uid, our_user_link]])
                        Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
                        our_user_list.append(our_user_dict)
                    else:
                        our_user_list.append({
                            'out_uid': out_uid,
                            'user_name': user_name,
                            'our_uid': our_uid,
                            'our_user_link': our_user_link,
                        })
                return our_user_list
        except Exception as e:
            Common.logger(log_type, crawler).error(f'get_user_list:{e}\n')

    @classmethod
    def get_videoList(cls, log_type, crawler, out_uid):
        """Fetch one page of a user's video feed via visionProfilePhotoList.

        cls.pcursor supplies the pagination cursor ("" = first page).
        Returns the list of feed entries, or None when the response is
        malformed or there are no more videos (logged).
        """
        url = "https://www.kuaishou.com/graphql"
        payload = json.dumps({
            "operationName": "visionProfilePhotoList",
            "variables": {
                "userId": out_uid,
                "pcursor": cls.pcursor,
                "page": "profile"
            },
            "query": "fragment photoContent on PhotoEntity {\n  id\n  duration\n  caption\n  originCaption\n  likeCount\n  viewCount\n  realLikeCount\n  coverUrl\n  photoUrl\n  photoH265Url\n  manifest\n  manifestH265\n  videoResource\n  coverUrls {\n    url\n    __typename\n  }\n  timestamp\n  expTag\n  animatedCoverUrl\n  distance\n  videoRatio\n  liked\n  stereoType\n  profileUserTopPhoto\n  musicBlocked\n  __typename\n}\n\nfragment feedContent on Feed {\n  type\n  author {\n    id\n    name\n    headerUrl\n    following\n    headerUrls {\n      url\n      __typename\n    }\n    __typename\n  }\n  photo {\n    ...photoContent\n    __typename\n  }\n  canAddComment\n  llsid\n  status\n  currentPcursor\n  tags {\n    type\n    name\n    __typename\n  }\n  __typename\n}\n\nquery visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n  visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n    result\n    llsid\n    webPageArea\n    feeds {\n      ...feedContent\n      __typename\n    }\n    hostName\n    pcursor\n    __typename\n  }\n}\n"
        })
        headers = {
            'Cookie': 'kpf=PC_WEB; clientid=3; did=web_e2901e1c5a13c60af81ba88bc7a3ee24; userId=1268646616; kuaishou.server.web_st=ChZrdWFpc2hvdS5zZXJ2ZXIud2ViLnN0EqABOLgYYcIJ5ilxU46Jc-HLWThY8sppX3V0htC_KhSGOzAjP2hAOdegzfkZGAxS5rf6rCBS487FkxfYzLkV__I6b1lK16rDjvv94Kkoo4z7mgf8y8rFgWoqrp81JAWTtx00y-wrc1XXPf9RAVQoET70wWaeNG2r5bxtZEiNwpK_zPi0ZdUo0BW13dFKfVssAy2xKYh0UlJ8VSd_vBvyMKSxVBoSf061Kc3w5Nem7YdpVBmH39ceIiBpiGioLzbZqlHiSbwkH_LhUhNXz3o7LITj098KUytk2CgFMAE; kuaishou.server.web_ph=f1033957981996a7d50e849a9ded4cf4adff; kpn=KUAISHOU_VISION',
            'Referer': f'https://www.kuaishou.com/profile/{out_uid}',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41',
            'content-type': 'application/json',
        }
        response = requests.post(url=url, headers=headers, data=payload)
        if response.status_code != 200:
            Common.logger(log_type, crawler).warning(f"get_videoList_response:{response.text}\n")
            return
        # Parse the body once instead of re-parsing per check.
        body = response.json()
        if 'data' not in body:
            Common.logger(log_type, crawler).warning(f"get_videoList_response:{body}\n")
            return
        if 'visionProfilePhotoList' not in body['data']:
            Common.logger(log_type, crawler).warning(f"get_videoList_response:{body['data']}\n")
            return
        if 'feeds' not in body['data']['visionProfilePhotoList']:
            Common.logger(log_type, crawler).warning(f"get_videoList_response:{body['data']['visionProfilePhotoList']}\n")
            return
        if len(body['data']['visionProfilePhotoList']['feeds']) == 0:
            Common.logger(log_type, crawler).info("没有更多视频啦 ~\n")
            return
        feeds = body['data']['visionProfilePhotoList']['feeds']
        # BUGFIX: the original computed `feeds` and discarded it, so the
        # function always returned None; return the page to the caller.
        return feeds


if __name__ == "__main__":
    print(Follow.filter_words("follow", "kuaishou"))
    # Follow.get_user_list("follow", "kuaishou", "2OLxLr", "dev", "local")
    # Follow.get_videoList("3xgh4ja9be3wcaw")