# kuaishou_follow_pc.py
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/2/24
import json
import os
import sys
import time

import requests

sys.path.append(os.getcwd())
from common.common import Common
from common.feishu import Feishu
from common.users import Users
class Follow:
    # Pagination cursor for the video-list API; "" requests the first page.
    pcursor = ""
    # Platform / tag labels attached to every user record created by this crawler.
    platform = "快手"
    tag = "快手爬虫,定向爬虫策略"
  17. # 过滤词库
  18. @classmethod
  19. def filter_words(cls, log_type, crawler):
  20. try:
  21. while True:
  22. filter_words_sheet = Feishu.get_values_batch(log_type, crawler, 'HIKVvs')
  23. if filter_words_sheet is None:
  24. Common.logger(log_type, crawler).warning(f"filter_words_sheet:{filter_words_sheet} 10秒钟后重试")
  25. continue
  26. filter_words_list = []
  27. for x in filter_words_sheet:
  28. for y in x:
  29. if y is None:
  30. pass
  31. else:
  32. filter_words_list.append(y)
  33. return filter_words_list
  34. except Exception as e:
  35. Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
  36. # 获取站外用户信息
  37. @classmethod
  38. def get_out_user_info(cls, log_type, crawler, out_uid):
  39. try:
  40. url = "https://www.kuaishou.com/graphql"
  41. payload = json.dumps({
  42. "operationName": "visionProfile",
  43. "variables": {
  44. "userId": out_uid
  45. },
  46. "query": "query visionProfile($userId: String) {\n visionProfile(userId: $userId) {\n result\n hostName\n userProfile {\n ownerCount {\n fan\n photo\n follow\n photo_public\n __typename\n }\n profile {\n gender\n user_name\n user_id\n headurl\n user_text\n user_profile_bg_url\n __typename\n }\n isFollowing\n __typename\n }\n __typename\n }\n}\n"
  47. })
  48. headers = {
  49. 'Cookie': 'kpf=PC_WEB; clientid=3; did=web_e2901e1c5a13c60af81ba88bc7a3ee24; userId=1921947321; kpn=KUAISHOU_VISION; kuaishou.server.web_st=ChZrdWFpc2hvdS5zZXJ2ZXIud2ViLnN0EqABE4wGjnJauApJelOpl9Xqo8TVDAyra7Pvo0rZtVgMSZxgVuw4Z6P2UtHv_CHOk2Ne2el1hdE_McCptWs8tRdtYlhXFlVOu8rQX7CwexzOBudJAfB3lDN8LPc4o4qHNwqFxy5J5j_WzdllbqMmaDUK9yUxX6XA-JFezzq9jvBwtGv7_hzB7pFrUcH39z0EYOQaZo5lDl-pE09Gw7wr8NvlZRoSdWlbobCW6oJxuQLJTUr9oj_uIiBhkeb1psaIIc3VwfYQ1UfvobrXAP_WpnRabE_3UZUBOygFMAE; kuaishou.server.web_ph=2b981e2051d7130c977fd31df97fe6f5ad54',
  50. 'Referer': f'https://www.kuaishou.com/profile/{out_uid}',
  51. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41',
  52. 'content-type': 'application/json',
  53. # 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  54. # 'Cache-Control': 'no-cache',
  55. # 'Connection': 'keep-alive',
  56. # 'Origin': 'https://www.kuaishou.com',
  57. # 'Pragma': 'no-cache',
  58. # 'Sec-Fetch-Dest': 'empty',
  59. # 'Sec-Fetch-Mode': 'cors',
  60. # 'Sec-Fetch-Site': 'same-origin',
  61. # 'accept': '*/*',
  62. # 'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Microsoft Edge";v="110"',
  63. # 'sec-ch-ua-mobile': '?0',
  64. # 'sec-ch-ua-platform': '"macOS"'
  65. }
  66. response = requests.post(url=url, headers=headers, data=payload)
  67. if response.status_code != 200:
  68. Common.logger(log_type, crawler).warning(f"get_out_user_info_response:{response.text}\n")
  69. return
  70. elif 'data' not in response.json():
  71. Common.logger(log_type, crawler).warning(f"get_out_user_info_response:{response.json()}\n")
  72. return
  73. elif 'visionProfile' not in response.json()['data']:
  74. Common.logger(log_type, crawler).warning(f"get_out_user_info_response:{response.json()['data']}\n")
  75. return
  76. elif 'userProfile' not in response.json()['data']['visionProfile']:
  77. Common.logger(log_type, crawler).warning(f"get_out_user_info_response:{response.json()['data']['visionProfile']['userProfile']}\n")
  78. return
  79. else:
  80. userProfile = response.json()['data']['visionProfile']['userProfile']
  81. out_user_dict = {}
  82. if 'ownerCount' not in userProfile:
  83. out_user_dict['out_fans'] = 0
  84. out_user_dict['out_fans'] = 0
  85. elif 'fan' not in userProfile['ownerCount']:
  86. out_user_dict['out_fans'] = 0
  87. elif 'follow' not in userProfile['ownerCount']:
  88. out_user_dict['out_fans'] = 0
  89. else:
  90. out_fans_str = str(userProfile['ownerCount']['fan'])
  91. out_follow_str = str(userProfile['ownerCount']['follow'])
  92. if "万" in out_fans_str:
  93. out_user_dict['out_fans'] = int(float(out_fans_str.split("万")[0]) * 10000)
  94. else:
  95. out_user_dict['out_fans'] = int(out_fans_str.replace(",", ""))
  96. if "万" in out_follow_str:
  97. out_user_dict['out_follow'] = int(float(out_follow_str.split("万")[0]) * 10000)
  98. else:
  99. out_user_dict['out_follow'] = int(out_follow_str.replace(",", ""))
  100. if 'profile' not in userProfile:
  101. out_user_dict['out_avatar_url'] = ''
  102. elif 'headurl' not in userProfile['profile']:
  103. out_user_dict['out_avatar_url'] = ''
  104. else:
  105. out_user_dict['out_avatar_url'] = userProfile['profile']['headurl']
  106. return out_user_dict
  107. except Exception as e:
  108. Common.logger(log_type, crawler).error(f"get_out_user_info:{e}\n")
  109. # 获取用户信息(字典格式). 注意:部分 user_id 字符类型是 int / str
  110. @classmethod
  111. def get_user_list(cls, log_type, crawler, sheetid, env, machine):
  112. try:
  113. while True:
  114. user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
  115. if user_sheet is None:
  116. Common.logger(log_type, crawler).warning(f"user_sheet:{user_sheet} 10秒钟后重试")
  117. continue
  118. our_user_list = []
  119. # for i in range(1, len(user_sheet)):
  120. for i in range(1, 3):
  121. out_uid = user_sheet[i][2]
  122. user_name = user_sheet[i][3]
  123. our_uid = user_sheet[i][6]
  124. our_user_link = user_sheet[i][7]
  125. if out_uid is None or user_name is None:
  126. Common.logger(log_type, crawler).info("空行\n")
  127. else:
  128. Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
  129. if our_uid is None:
  130. out_user_info = cls.get_out_user_info(log_type, crawler, out_uid)
  131. out_user_dict = {
  132. "out_uid": out_uid,
  133. "user_name": user_name,
  134. "out_avatar_url": out_user_info["out_avatar_url"],
  135. "out_create_time": '',
  136. "out_tag": '',
  137. "out_play_cnt": 0,
  138. "out_fans": out_user_info["out_fans"],
  139. "out_follow": out_user_info["out_follow"],
  140. "out_friend": 0,
  141. "out_like": 0,
  142. "platform": cls.platform,
  143. "tag": cls.tag,
  144. }
  145. our_user_dict = Users.create_user(log_type=log_type, crawler=crawler,
  146. out_user_dict=out_user_dict, env=env, machine=machine)
  147. our_uid = our_user_dict['our_uid']
  148. our_user_link = our_user_dict['our_user_link']
  149. Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}',
  150. [[our_uid, our_user_link]])
  151. Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
  152. our_user_list.append(our_user_dict)
  153. else:
  154. our_user_dict = {
  155. 'out_uid': out_uid,
  156. 'user_name': user_name,
  157. 'our_uid': our_uid,
  158. 'our_user_link': our_user_link,
  159. }
  160. our_user_list.append(our_user_dict)
  161. return our_user_list
  162. except Exception as e:
  163. Common.logger(log_type, crawler).error(f'get_user_list:{e}\n')
    @classmethod
    def get_videoList(cls, log_type, crawler, out_uid):
        """Request one page of a user's video feed from the Kuaishou GraphQL API.

        Uses the class-level ``cls.pcursor`` as the pagination cursor.

        NOTE(review): ``feeds`` is computed on the success path but neither
        returned nor processed, and ``cls.pcursor`` is never advanced from the
        response — this method looks unfinished; confirm against the intended
        caller before relying on it.

        :param log_type: logger channel name
        :param crawler: crawler name (used for logging)
        :param out_uid: Kuaishou user id whose videos are listed
        :return: always None (see NOTE above)
        """
        url = "https://www.kuaishou.com/graphql"
        payload = json.dumps({
            "operationName": "visionProfilePhotoList",
            "variables": {
                "userId": out_uid,
                "pcursor": cls.pcursor,
                "page": "profile"
            },
            "query": "fragment photoContent on PhotoEntity {\n id\n duration\n caption\n originCaption\n likeCount\n viewCount\n realLikeCount\n coverUrl\n photoUrl\n photoH265Url\n manifest\n manifestH265\n videoResource\n coverUrls {\n url\n __typename\n }\n timestamp\n expTag\n animatedCoverUrl\n distance\n videoRatio\n liked\n stereoType\n profileUserTopPhoto\n musicBlocked\n __typename\n}\n\nfragment feedContent on Feed {\n type\n author {\n id\n name\n headerUrl\n following\n headerUrls {\n url\n __typename\n }\n __typename\n }\n photo {\n ...photoContent\n __typename\n }\n canAddComment\n llsid\n status\n currentPcursor\n tags {\n type\n name\n __typename\n }\n __typename\n}\n\nquery visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n ...feedContent\n __typename\n }\n hostName\n pcursor\n __typename\n }\n}\n"
        })
        headers = {
            # NOTE(review): hard-coded session cookie — presumably expires; verify.
            'Cookie': 'kpf=PC_WEB; clientid=3; did=web_e2901e1c5a13c60af81ba88bc7a3ee24; userId=1268646616; kuaishou.server.web_st=ChZrdWFpc2hvdS5zZXJ2ZXIud2ViLnN0EqABOLgYYcIJ5ilxU46Jc-HLWThY8sppX3V0htC_KhSGOzAjP2hAOdegzfkZGAxS5rf6rCBS487FkxfYzLkV__I6b1lK16rDjvv94Kkoo4z7mgf8y8rFgWoqrp81JAWTtx00y-wrc1XXPf9RAVQoET70wWaeNG2r5bxtZEiNwpK_zPi0ZdUo0BW13dFKfVssAy2xKYh0UlJ8VSd_vBvyMKSxVBoSf061Kc3w5Nem7YdpVBmH39ceIiBpiGioLzbZqlHiSbwkH_LhUhNXz3o7LITj098KUytk2CgFMAE; kuaishou.server.web_ph=f1033957981996a7d50e849a9ded4cf4adff; kpn=KUAISHOU_VISION',
            'Referer': f'https://www.kuaishou.com/profile/{out_uid}',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41',
            'content-type': 'application/json',
            # 'accept': '*/*',
            # 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            # 'Cache-Control': 'no-cache',
            # 'Connection': 'keep-alive',
            # 'Origin': 'https://www.kuaishou.com',
            # 'Pragma': 'no-cache',
            # 'Sec-Fetch-Dest': 'empty',
            # 'Sec-Fetch-Mode': 'cors',
            # 'Sec-Fetch-Site': 'same-origin',
            # 'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Microsoft Edge";v="110"',
            # 'sec-ch-ua-mobile': '?0',
            # 'sec-ch-ua-platform': '"macOS"'
        }
        response = requests.post(url=url, headers=headers, data=payload)
        # Validate the response shape step by step; log and bail out on anything
        # unexpected rather than raising.
        if response.status_code != 200:
            Common.logger(log_type, crawler).warning(f"get_videoList_response:{response.text}\n")
            return
        elif 'data' not in response.json():
            Common.logger(log_type, crawler).warning(f"get_videoList_response:{response.json()}\n")
            return
        elif 'visionProfilePhotoList' not in response.json()['data']:
            Common.logger(log_type, crawler).warning(f"get_videoList_response:{response.json()['data']}\n")
            return
        elif 'feeds' not in response.json()['data']['visionProfilePhotoList']:
            Common.logger(log_type, crawler).warning(f"get_videoList_response:{response.json()['data']['visionProfilePhotoList']}\n")
            return
        elif len(response.json()['data']['visionProfilePhotoList']['feeds']) == 0:
            # Empty feed list means we've paged past the last video.
            Common.logger(log_type, crawler).info("没有更多视频啦 ~\n")
            return
        else:
            # See NOTE above: feeds is currently unused.
            feeds = response.json()['data']['visionProfilePhotoList']['feeds']
if __name__ == "__main__":
    # Ad-hoc manual test entry point.
    print(Follow.filter_words("follow", "kuaishou"))
    # Follow.get_user_list("follow", "kuaishou", "2OLxLr", "dev", "local")
    # Follow.get_videoList("3xgh4ja9be3wcaw")
    pass