kuaishou_search.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
"""
Kuaishou search crawler
@Author: luojunhui
"""
  5. import os
  6. import sys
  7. import json
  8. import time
  9. import uuid
  10. import random
  11. import datetime
  12. import requests
  13. from lxml import etree
  14. sys.path.append(os.getcwd())
  15. from application.items import VideoItem
  16. from application.pipeline import PiaoQuanPipeline
  17. from application.common.messageQueue import MQ
  18. from application.common.proxies import tunnel_proxies
  19. from application.common.log import AliyunLogger
  20. class KuaiShouSearch(object):
  21. """
  22. 快手 Search
  23. """
  24. def __init__(self, platform, mode, rule_dict, user_list, env="prod"):
  25. self.platform = platform
  26. self.mode = mode
  27. self.rule_dict = rule_dict
  28. self.user_list = user_list
  29. self.env = env
  30. self.download_cnt = 0
  31. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  32. self.expire_flag = False
  33. self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)
  34. def search_videos(self, keyword):
  35. """
  36. search, 一次搜索只抓 20 条视频
  37. :param keyword: 关键词
  38. :return: video_list
  39. """
  40. url = 'https://www.kuaishou.com/graphql'
  41. headers = {
  42. 'Accept-Language': 'zh,en;q=0.9,zh-CN;q=0.8',
  43. 'Connection': 'keep-alive',
  44. 'Cookie': 'kpf=PC_WEB; clientid=3; did=web_5db53a9e49dca57728b58cecb7863868; didv=1698736264000; kpn=KUAISHOU_VISION',
  45. 'Origin': 'https://www.kuaishou.com',
  46. 'Referer': 'https://www.kuaishou.com/search/video?searchKey=%E8%80%81%E5%B9%B4%E5%A4%A7%E5%AD%A6',
  47. 'Sec-Fetch-Dest': 'empty',
  48. 'Sec-Fetch-Mode': 'cors',
  49. 'Sec-Fetch-Site': 'same-origin',
  50. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
  51. 'accept': '*/*',
  52. 'content-type': 'application/json',
  53. 'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
  54. 'sec-ch-ua-mobile': '?0',
  55. 'sec-ch-ua-platform': '"macOS"',
  56. }
  57. data = {
  58. "operationName": "visionSearchPhoto",
  59. "variables": {
  60. "keyword": keyword,
  61. "pcursor": "",
  62. "page": "search"
  63. },
  64. "query": """
  65. fragment photoContent on PhotoEntity {
  66. __typename
  67. id
  68. duration
  69. caption
  70. originCaption
  71. likeCount
  72. viewCount
  73. commentCount
  74. realLikeCount
  75. coverUrl
  76. photoUrl
  77. photoH265Url
  78. manifest
  79. manifestH265
  80. videoResource
  81. coverUrls {
  82. url
  83. __typename
  84. }
  85. timestamp
  86. expTag
  87. animatedCoverUrl
  88. distance
  89. videoRatio
  90. liked
  91. stereoType
  92. profileUserTopPhoto
  93. musicBlocked
  94. riskTagContent
  95. riskTagUrl
  96. }
  97. fragment recoPhotoFragment on recoPhotoEntity {
  98. __typename
  99. id
  100. duration
  101. caption
  102. originCaption
  103. likeCount
  104. viewCount
  105. commentCount
  106. realLikeCount
  107. coverUrl
  108. photoUrl
  109. photoH265Url
  110. manifest
  111. manifestH265
  112. videoResource
  113. coverUrls {
  114. url
  115. __typename
  116. }
  117. timestamp
  118. expTag
  119. animatedCoverUrl
  120. distance
  121. videoRatio
  122. liked
  123. stereoType
  124. profileUserTopPhoto
  125. musicBlocked
  126. riskTagContent
  127. riskTagUrl
  128. }
  129. fragment feedContent on Feed {
  130. type
  131. author {
  132. id
  133. name
  134. headerUrl
  135. following
  136. headerUrls {
  137. url
  138. __typename
  139. }
  140. __typename
  141. }
  142. photo {
  143. ...photoContent
  144. ...recoPhotoFragment
  145. __typename
  146. }
  147. canAddComment
  148. llsid
  149. status
  150. currentPcursor
  151. tags {
  152. type
  153. name
  154. __typename
  155. }
  156. __typename
  157. }
  158. query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {
  159. visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {
  160. result
  161. llsid
  162. webPageArea
  163. feeds {
  164. ...feedContent
  165. __typename
  166. }
  167. searchSessionId
  168. pcursor
  169. aladdinBanner {
  170. imgUrl
  171. link
  172. __typename
  173. }
  174. __typename
  175. }
  176. }
  177. """
  178. }
  179. response = requests.post(url, headers=headers, json=data).json()
  180. video_list = response['data']['visionSearchPhoto']['feeds']
  181. return video_list
  182. def process_video_obj(self, video_obj):
  183. """
  184. 处理视频信息
  185. :return:
  186. """
  187. # print(json.dumps(video_obj, ensure_ascii=False, indent=4))
  188. trace_id = self.platform + str(uuid.uuid1())
  189. our_user = random.choice(self.user_list)
  190. publish_time_stamp = int(video_obj["photo"]["timestamp"] / 1000)
  191. item = VideoItem()
  192. item.add_video_info("user_id", our_user["uid"])
  193. item.add_video_info("user_name", our_user["nick_name"])
  194. item.add_video_info("video_id", video_obj["photo"]["manifest"]["videoId"])
  195. item.add_video_info("video_title", video_obj["photo"]['caption'])
  196. # item.add_video_info("publish_time_str", video_obj["photo"]['timestamp'])
  197. item.add_video_info("publish_time_stamp", int(publish_time_stamp))
  198. item.add_video_info("video_url", video_obj["photo"]['manifest']['adaptationSet'][0]['representation'][0]['url'])
  199. item.add_video_info(
  200. "cover_url", video_obj["photo"]["coverUrl"]
  201. )
  202. item.add_video_info("like_cnt", video_obj["photo"]["realLikeCount"])
  203. item.add_video_info("play_cnt", video_obj["photo"]["viewCount"])
  204. item.add_video_info("out_video_id", video_obj["photo"]["manifest"]["videoId"])
  205. item.add_video_info("platform", self.platform)
  206. item.add_video_info("strategy", self.mode)
  207. item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
  208. mq_obj = item.produce_item()
  209. print(json.dumps(mq_obj, ensure_ascii=False, indent=4))
  210. if __name__ == '__main__':
  211. KS = KuaiShouSearch(platform="kuaishou", mode="search", rule_dict={}, user_list=[{"uid": 1, "nick_name": "ljh"}])
  212. video_list = KS.search_videos("王者荣耀")
  213. for i in video_list:
  214. KS.process_video_obj(i)