# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/2/24
import json
import os
import sys
import time

import requests

# Make packages under the current working directory importable; this must run
# before the project-local imports below.
sys.path.append(os.getcwd())
from common.common import Common
from common.feishu import Feishu
from common.users import Users
class Follow:
    """Helpers for the Kuaishou targeted-account crawler.

    All methods are classmethods/staticmethods; the class only carries
    shared configuration as class attributes.
    """

    # Pagination cursor sent with the video-list query ("" = first page).
    pcursor = ""
    platform = "快手"
    tag = "快手爬虫,定向爬虫策略"

    _GRAPHQL_URL = "https://www.kuaishou.com/graphql"
    _USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41'

    @classmethod
    def _fetch_sheet(cls, log_type, crawler, sheetid, label):
        """Read a Feishu sheet, retrying every 10 seconds while it is None.

        :param label: name used as the prefix of the retry warning.
        :return: whatever ``Feishu.get_values_batch`` returns once it is not None.
        """
        while True:
            sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
            if sheet is not None:
                return sheet
            Common.logger(log_type, crawler).warning(f"{label}:{sheet} 10秒钟后重试")
            # The log message promises a 10 second back-off; without the sleep
            # this loop would hammer the Feishu API.
            time.sleep(10)

    @staticmethod
    def _parse_count(raw):
        """Convert a count such as ``1234``, ``"1,234"`` or ``"1.5万"`` to int."""
        text = str(raw)
        if "万" in text:
            # "万" is the 10,000 unit. round() avoids losing one to float
            # representation error when truncating.
            return round(float(text.split("万")[0]) * 10000)
        return int(text.replace(",", ""))

    @classmethod
    def _parse_user_profile(cls, user_profile):
        """Build the off-site user dict from a ``userProfile`` mapping.

        Fan/follow counts fall back to 0 unless both are present, and the
        avatar URL falls back to ''. All three keys are always set.
        """
        owner_count = user_profile.get('ownerCount')
        if isinstance(owner_count, dict) and 'fan' in owner_count and 'follow' in owner_count:
            out_fans = cls._parse_count(owner_count['fan'])
            out_follow = cls._parse_count(owner_count['follow'])
        else:
            out_fans = out_follow = 0

        profile = user_profile.get('profile')
        if isinstance(profile, dict) and 'headurl' in profile:
            out_avatar_url = profile['headurl']
        else:
            out_avatar_url = ''

        return {
            'out_fans': out_fans,
            'out_follow': out_follow,
            'out_avatar_url': out_avatar_url,
        }

    # Filter word list
    @classmethod
    def filter_words(cls, log_type, crawler):
        """Return every non-None cell of the 'HIKVvs' Feishu sheet as a flat list.

        Returns None (after logging) if anything raises.
        """
        try:
            sheet = cls._fetch_sheet(log_type, crawler, 'HIKVvs', "filter_words_sheet")
            return [cell for row in sheet for cell in row if cell is not None]
        except Exception as e:
            Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')

    # Fetch off-site (Kuaishou) user info
    @classmethod
    def get_out_user_info(cls, log_type, crawler, out_uid):
        """Query the Kuaishou ``visionProfile`` GraphQL operation for a user.

        :param out_uid: Kuaishou user id placed in the query and the Referer.
        :return: dict with ``out_fans``, ``out_follow`` and ``out_avatar_url``,
            or None when the response is unusable or an exception was logged.
        """
        try:
            payload = json.dumps({
                "operationName": "visionProfile",
                "variables": {
                    "userId": out_uid
                },
                "query": "query visionProfile($userId: String) {\n visionProfile(userId: $userId) {\n result\n hostName\n userProfile {\n ownerCount {\n fan\n photo\n follow\n photo_public\n __typename\n }\n profile {\n gender\n user_name\n user_id\n headurl\n user_text\n user_profile_bg_url\n __typename\n }\n isFollowing\n __typename\n }\n __typename\n }\n}\n"
            })
            headers = {
                # NOTE(review): hard-coded session cookie; consider moving it to
                # configuration / a secret store.
                'Cookie': 'kpf=PC_WEB; clientid=3; did=web_e2901e1c5a13c60af81ba88bc7a3ee24; userId=1921947321; kpn=KUAISHOU_VISION; kuaishou.server.web_st=ChZrdWFpc2hvdS5zZXJ2ZXIud2ViLnN0EqABE4wGjnJauApJelOpl9Xqo8TVDAyra7Pvo0rZtVgMSZxgVuw4Z6P2UtHv_CHOk2Ne2el1hdE_McCptWs8tRdtYlhXFlVOu8rQX7CwexzOBudJAfB3lDN8LPc4o4qHNwqFxy5J5j_WzdllbqMmaDUK9yUxX6XA-JFezzq9jvBwtGv7_hzB7pFrUcH39z0EYOQaZo5lDl-pE09Gw7wr8NvlZRoSdWlbobCW6oJxuQLJTUr9oj_uIiBhkeb1psaIIc3VwfYQ1UfvobrXAP_WpnRabE_3UZUBOygFMAE; kuaishou.server.web_ph=2b981e2051d7130c977fd31df97fe6f5ad54',
                'Referer': f'https://www.kuaishou.com/profile/{out_uid}',
                'User-Agent': cls._USER_AGENT,
                'content-type': 'application/json',
            }
            response = requests.post(url=cls._GRAPHQL_URL, headers=headers, data=payload)
            logger = Common.logger(log_type, crawler)
            if response.status_code != 200:
                logger.warning(f"get_out_user_info_response:{response.text}\n")
                return

            # Parse the body once instead of on every check.
            body = response.json()
            if 'data' not in body:
                logger.warning(f"get_out_user_info_response:{body}\n")
                return
            data = body['data']
            if 'visionProfile' not in data:
                logger.warning(f"get_out_user_info_response:{data}\n")
                return
            vision_profile = data['visionProfile']
            if 'userProfile' not in vision_profile:
                # Log the container: the key is known to be missing here.
                logger.warning(f"get_out_user_info_response:{vision_profile}\n")
                return

            return cls._parse_user_profile(vision_profile['userProfile'])
        except Exception as e:
            Common.logger(log_type, crawler).error(f"get_out_user_info:{e}\n")

    # Fetch user info (as dicts). Note: some user_id values are int, some str.
    @classmethod
    def get_user_list(cls, log_type, crawler, sheetid, env, machine):
        """Build the list of users from the Feishu sheet ``sheetid``.

        Row 0 is skipped (treated as the header). For each remaining row,
        column 2 is the off-site uid, column 3 the user name, and columns
        6/7 the in-house uid and link. Rows without an in-house uid get a
        user created through ``Users.create_user`` and the new uid/link is
        written back to columns G:H of that row.

        :return: list of user dicts, or None if an exception was logged.
        """
        try:
            user_sheet = cls._fetch_sheet(log_type, crawler, sheetid, "user_sheet")
            logger = Common.logger(log_type, crawler)
            our_user_list = []
            # start=2: sheet rows are 1-based and row 1 is the skipped header.
            for sheet_row, row in enumerate(user_sheet[1:], start=2):
                out_uid = row[2]
                user_name = row[3]
                our_uid = row[6]
                our_user_link = row[7]
                if out_uid is None or user_name is None:
                    logger.info("空行\n")
                    continue

                logger.info(f"正在更新 {user_name} 用户信息\n")
                if our_uid is not None:
                    our_user_list.append({
                        'out_uid': out_uid,
                        'user_name': user_name,
                        'our_uid': our_uid,
                        'our_user_link': our_user_link,
                    })
                    continue

                out_user_info = cls.get_out_user_info(log_type, crawler, out_uid)
                if out_user_info is None:
                    # Profile lookup failed (already logged); skip this user
                    # instead of aborting the whole list.
                    logger.warning(f"get_out_user_info returned nothing for {out_uid}, skipped\n")
                    continue

                out_user_dict = {
                    "out_uid": out_uid,
                    "user_name": user_name,
                    "out_avatar_url": out_user_info["out_avatar_url"],
                    "out_create_time": '',
                    "out_tag": '',
                    "out_play_cnt": 0,
                    "out_fans": out_user_info["out_fans"],
                    "out_follow": out_user_info["out_follow"],
                    "out_friend": 0,
                    "out_like": 0,
                    "platform": cls.platform,
                    "tag": cls.tag,
                }
                our_user_dict = Users.create_user(log_type=log_type, crawler=crawler,
                                                  out_user_dict=out_user_dict, env=env, machine=machine)
                our_uid = our_user_dict['our_uid']
                our_user_link = our_user_dict['our_user_link']
                Feishu.update_values(log_type, crawler, sheetid, f'G{sheet_row}:H{sheet_row}',
                                     [[our_uid, our_user_link]])
                logger.info('站内用户信息写入飞书成功!\n')
                our_user_list.append(our_user_dict)
            return our_user_list
        except Exception as e:
            Common.logger(log_type, crawler).error(f'get_user_list:{e}\n')

    @classmethod
    def get_videoList(cls, log_type, crawler, out_uid):
        """Query one page of a user's videos via ``visionProfilePhotoList``.

        The page requested is the one addressed by ``cls.pcursor``.
        NOTE(review): the cursor is not advanced here — confirm whether the
        caller is expected to update ``Follow.pcursor``.

        :return: the non-empty ``feeds`` list, or None when the response is
            unusable or contains no feeds.
        """
        payload = json.dumps({
            "operationName": "visionProfilePhotoList",
            "variables": {
                "userId": out_uid,
                "pcursor": cls.pcursor,
                "page": "profile"
            },
            "query": "fragment photoContent on PhotoEntity {\n id\n duration\n caption\n originCaption\n likeCount\n viewCount\n realLikeCount\n coverUrl\n photoUrl\n photoH265Url\n manifest\n manifestH265\n videoResource\n coverUrls {\n url\n __typename\n }\n timestamp\n expTag\n animatedCoverUrl\n distance\n videoRatio\n liked\n stereoType\n profileUserTopPhoto\n musicBlocked\n __typename\n}\n\nfragment feedContent on Feed {\n type\n author {\n id\n name\n headerUrl\n following\n headerUrls {\n url\n __typename\n }\n __typename\n }\n photo {\n ...photoContent\n __typename\n }\n canAddComment\n llsid\n status\n currentPcursor\n tags {\n type\n name\n __typename\n }\n __typename\n}\n\nquery visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n ...feedContent\n __typename\n }\n hostName\n pcursor\n __typename\n }\n}\n"
        })
        headers = {
            # NOTE(review): hard-coded session cookie; consider moving it to
            # configuration / a secret store.
            'Cookie': 'kpf=PC_WEB; clientid=3; did=web_e2901e1c5a13c60af81ba88bc7a3ee24; userId=1268646616; kuaishou.server.web_st=ChZrdWFpc2hvdS5zZXJ2ZXIud2ViLnN0EqABOLgYYcIJ5ilxU46Jc-HLWThY8sppX3V0htC_KhSGOzAjP2hAOdegzfkZGAxS5rf6rCBS487FkxfYzLkV__I6b1lK16rDjvv94Kkoo4z7mgf8y8rFgWoqrp81JAWTtx00y-wrc1XXPf9RAVQoET70wWaeNG2r5bxtZEiNwpK_zPi0ZdUo0BW13dFKfVssAy2xKYh0UlJ8VSd_vBvyMKSxVBoSf061Kc3w5Nem7YdpVBmH39ceIiBpiGioLzbZqlHiSbwkH_LhUhNXz3o7LITj098KUytk2CgFMAE; kuaishou.server.web_ph=f1033957981996a7d50e849a9ded4cf4adff; kpn=KUAISHOU_VISION',
            'Referer': f'https://www.kuaishou.com/profile/{out_uid}',
            'User-Agent': cls._USER_AGENT,
            'content-type': 'application/json',
        }
        response = requests.post(url=cls._GRAPHQL_URL, headers=headers, data=payload)
        logger = Common.logger(log_type, crawler)
        if response.status_code != 200:
            logger.warning(f"get_videoList_response:{response.text}\n")
            return

        # Parse the body once instead of on every check.
        body = response.json()
        if 'data' not in body:
            logger.warning(f"get_videoList_response:{body}\n")
            return
        data = body['data']
        if 'visionProfilePhotoList' not in data:
            logger.warning(f"get_videoList_response:{data}\n")
            return
        photo_list = data['visionProfilePhotoList']
        if 'feeds' not in photo_list:
            logger.warning(f"get_videoList_response:{photo_list}\n")
            return
        feeds = photo_list['feeds']
        if not feeds:
            logger.info("没有更多视频啦 ~\n")
            return
        # Hand the page back to the caller instead of discarding it.
        return feeds
if __name__ == "__main__":
    # Manual smoke test: print the filter words loaded from the Feishu sheet.
    words = Follow.filter_words("follow", "kuaishou")
    print(words)
    # Other ad-hoc entry points for local debugging:
    # Follow.get_user_list("follow", "kuaishou", "2OLxLr", "dev", "local")
    # Follow.get_videoList("follow", "kuaishou", "3xgh4ja9be3wcaw")
|