kuaishou_author_scheduling_new.py

# -*- coding: utf-8 -*-
# @Time: 2023/11/07
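#
# Scheduled crawler for Kuaishou author profile pages: pages through a user's
# recent videos via the web GraphQL endpoint, filters them against rule_dict,
# and publishes qualifying videos to the "topic_crawler_etl_<env>" MQ topic.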
import os
import random
import sys
import time
from datetime import date, timedelta
import requests
import json
import urllib3

sys.path.append(os.getcwd())
from common.common import Common
from common import AliyunLogger
from common.mq import MQ
from requests.adapters import HTTPAdapter
from common.scheduling_db import MysqlHelper
from common.public import random_title, get_config_from_mysql, download_rule


class KuaishouauthorScheduling:
    platform = "快手"
    # number of videos already sent downstream for the current author
    download_cnt = 0

    @classmethod
    def videos_cnt(cls, rule_dict):
        # per-author download quota from the rule; fall back to 1000 when unset
        videos_cnt = rule_dict.get("videos_cnt", {}).get("min", 0)
        if videos_cnt == 0:
            videos_cnt = 1000
        return videos_cnt

    @classmethod
    def video_title(cls, log_type, crawler, env, title):
        # strip hashtags, @mentions and illegal characters, keep at most 40 chars;
        # fall back to a random title when nothing usable remains
        title_split1 = title.split(" #")
        if title_split1[0] != "":
            title1 = title_split1[0]
        else:
            title1 = title_split1[-1]

        title_split2 = title1.split(" #")
        if title_split2[0] != "":
            title2 = title_split2[0]
        else:
            title2 = title_split2[-1]

        title_split3 = title2.split("@")
        if title_split3[0] != "":
            title3 = title_split3[0]
        else:
            title3 = title_split3[-1]

        video_title = title3.strip().replace("\n", "") \
            .replace("/", "").replace("快手", "").replace(" ", "") \
            .replace(" ", "").replace("&NBSP", "").replace("\r", "") \
            .replace("#", "").replace(".", "。").replace("\\", "") \
            .replace(":", "").replace("*", "").replace("?", "") \
            .replace("?", "").replace('"', "").replace("<", "") \
            .replace(">", "").replace("|", "").replace("@", "").replace('"', '').replace("'", '')[:40]

        if video_title.replace(" ", "") == "" or video_title == "。。。" or video_title == "...":
            return random_title(log_type, crawler, env, text='title')
        else:
            return video_title

    @classmethod
    def get_cookie(cls, log_type, crawler, env):
        # read the latest cookie configured for this crawler from crawler_config
        select_sql = f""" select * from crawler_config where source="{crawler}" """
        configs = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
        for config in configs:
            if "cookie" in config["config"]:
                cookie_dict = {
                    "cookie_id": config["id"],
                    "title": config["title"].strip(),
                    "cookie": dict(eval(config["config"]))["cookie"].strip(),
                    "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(config["update_time"] / 1000))),
                    "operator": config["operator"].strip()
                }
                return cookie_dict

    @classmethod
    def get_videoList(cls, log_type, crawler, user_dict, rule_dict, env):
        pcursor = ""
        mq = MQ(topic_name="topic_crawler_etl_" + env)
        while True:
            time.sleep(random.randint(10, 50))
            url = "https://www.kuaishou.com/graphql"
            payload = json.dumps({
                "operationName": "visionProfilePhotoList",
                "variables": {
                    "userId": user_dict["link"].replace("https://www.kuaishou.com/profile/", ""),
                    "pcursor": pcursor,
                    "page": "profile"
                },
                "query": "fragment photoContent on PhotoEntity {\n id\n duration\n caption\n originCaption\n likeCount\n viewCount\n commentCount\n realLikeCount\n coverUrl\n photoUrl\n photoH265Url\n manifest\n manifestH265\n videoResource\n coverUrls {\n url\n __typename\n }\n timestamp\n expTag\n animatedCoverUrl\n distance\n videoRatio\n liked\n stereoType\n profileUserTopPhoto\n musicBlocked\n __typename\n}\n\nfragment feedContent on Feed {\n type\n author {\n id\n name\n headerUrl\n following\n headerUrls {\n url\n __typename\n }\n __typename\n }\n photo {\n ...photoContent\n __typename\n }\n canAddComment\n llsid\n status\n currentPcursor\n tags {\n type\n name\n __typename\n }\n __typename\n}\n\nquery visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n ...feedContent\n __typename\n }\n hostName\n pcursor\n __typename\n }\n}\n"
            })
            cookie = cls.get_cookie(log_type, crawler, env)["cookie"]
            headers = {
                'Accept': '*/*',
                'Content-Type': 'application/json',
                'Origin': 'https://www.kuaishou.com',
                'Cookie': cookie,
                'Content-Length': '1260',
                'Accept-Language': 'zh-CN,zh-Hans;q=0.9',
                'Host': 'www.kuaishou.com',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15',
                'Referer': f'https://www.kuaishou.com/profile/{user_dict["link"].replace("https://www.kuaishou.com/profile/", "")}',
                'Accept-Encoding': 'gzip, deflate, br',
                'Connection': 'keep-alive'
            }
            urllib3.disable_warnings()
            s = requests.session()
            # max_retries=3: retry failed connections up to 3 times
            s.mount('http://', HTTPAdapter(max_retries=3))
            s.mount('https://', HTTPAdapter(max_retries=3))
            response = s.post(url=url, headers=headers, data=payload, proxies=Common.tunnel_proxies(), verify=False, timeout=10)
            response.close()
            # Common.logger(log_type, crawler).info(f"response:{response.text}\n")
            if response.status_code != 200:
                Common.logger(log_type, crawler).warning(f"response:{response.text}\n")
                AliyunLogger.logging(
                    code="2000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"response:{response.json()}\n"
                )
                return
            elif "data" not in response.json():
                Common.logger(log_type, crawler).warning(f"response:{response.json()}\n")
                AliyunLogger.logging(
                    code="2000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"response:{response.json()}\n"
                )
                return
            elif "visionProfilePhotoList" not in response.json()["data"]:
                Common.logger(log_type, crawler).warning(f"response:{response.json()}\n")
                AliyunLogger.logging(
                    code="2000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"response:{response.json()}\n"
                )
                return
            elif "feeds" not in response.json()["data"]["visionProfilePhotoList"]:
                Common.logger(log_type, crawler).warning(f"response:{response.json()}\n")
                AliyunLogger.logging(
                    code="2000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"response:{response.json()}\n"
                )
                return
            elif len(response.json()["data"]["visionProfilePhotoList"]["feeds"]) == 0:
                Common.logger(log_type, crawler).warning(f"没有更多视频啦 ~\n")
                AliyunLogger.logging(
                    code="2001",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"没有更多视频啦 ~\n"
                )
                return
            pcursor = response.json()['data']['visionProfilePhotoList']['pcursor']
            feeds = response.json()['data']['visionProfilePhotoList']['feeds']
            for i in range(len(feeds)):
                try:
                    Common.logger(log_type, crawler).info('扫描到一条视频\n')
                    AliyunLogger.logging(
                        code="1001",
                        platform=crawler,
                        mode=log_type,
                        env=env,
                        message='扫描到一条视频\n'
                    )
                    if cls.download_cnt >= cls.videos_cnt(rule_dict):
                        Common.logger(log_type, crawler).info(f"已下载视频数:{cls.download_cnt}\n")
                        AliyunLogger.logging(
                            code="2002",
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            message=f"已下载视频数:{cls.download_cnt}\n"
                        )
                        return
                    video_title = feeds[i].get("photo", {}).get("caption", random_title(log_type, crawler, env, text='title'))
                    video_title = cls.video_title(log_type, crawler, env, video_title)
                    try:
                        # prefer the h264 resource; fall back to hevc when it is missing
                        video_id = feeds[i].get("photo", {}).get("videoResource").get("h264", {}).get("videoId", "")
                        video_width = feeds[i].get("photo", {}).get("videoResource").get("h264", {}).get("adaptationSet", {})[0].get("representation", {})[0].get("width", 0)
                        video_height = feeds[i].get("photo", {}).get("videoResource").get("h264", {}).get("adaptationSet", {})[0].get("representation", {})[0].get("height", 0)
                    except KeyError:
                        video_id = feeds[i].get("photo", {}).get("videoResource").get("hevc", {}).get("videoId", "")
                        video_width = feeds[i].get("photo", {}).get("videoResource").get("hevc", {}).get("adaptationSet", {})[0].get("representation", {})[0].get("width", 0)
                        video_height = feeds[i].get("photo", {}).get("videoResource").get("hevc", {}).get("adaptationSet", {})[0].get("representation", {})[0].get("height", 0)
                    publish_time_stamp = int(int(feeds[i].get('photo', {}).get('timestamp', 0)) / 1000)
                    publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
                    # only keep videos published within the last 5 days
                    date_five_days_ago_string = (date.today() + timedelta(days=-5)).strftime("%Y-%m-%d %H:%M:%S")
                    rule = publish_time_str > date_five_days_ago_string
                    if i > 2 and not rule:
                        break
                    if not rule:
                        Common.logger(log_type, crawler).info(f"发布时间超过5天,发布时间:{publish_time_str}\n")
                        AliyunLogger.logging(
                            code="2004",
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            message=f"发布时间超过5天,发布时间:{publish_time_str}\n"
                        )
                        continue
                    viewCount = int(feeds[i].get('photo', {}).get('viewCount', 0))
                    realLikeCount = int(feeds[i].get('photo', {}).get('realLikeCount', 0))
                    # like/play ratio; guard against division by zero when a video has no plays
                    video_percent = '%.2f' % (realLikeCount / viewCount) if viewCount != 0 else '0.00'
                    if viewCount < 100000:
                        if float(video_percent) < 0.01:
                            Common.logger(log_type, crawler).info(f"不符合条件:点赞/播放-{video_percent},播放量-{viewCount}\n")
                            AliyunLogger.logging(
                                code="2004",
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                message=f"点赞量:{realLikeCount}\n"
                            )
                            continue
                    video_dict = {'video_title': video_title,
                                  'video_id': video_id,
                                  'play_cnt': int(feeds[i].get('photo', {}).get('viewCount', 0)),
                                  'like_cnt': int(feeds[i].get('photo', {}).get('realLikeCount', 0)),
                                  'comment_cnt': 0,
                                  'share_cnt': 0,
                                  'video_width': video_width,
                                  'video_height': video_height,
                                  'duration': int(int(feeds[i].get('photo', {}).get('duration', 0)) / 1000),
                                  'publish_time_stamp': publish_time_stamp,
                                  'publish_time_str': publish_time_str,
                                  'user_name': feeds[i].get('author', {}).get('name', ""),
                                  'user_id': feeds[i].get('author', {}).get('id', ""),
                                  'avatar_url': feeds[i].get('author', {}).get('headerUrl', ""),
                                  'cover_url': feeds[i].get('photo', {}).get('coverUrl', ""),
                                  'video_url': feeds[i].get('photo', {}).get('photoUrl', ""),
                                  'session': f"kuaishou-{int(time.time())}"}
                    for k, v in video_dict.items():
                        Common.logger(log_type, crawler).info(f"{k}:{v}")
                    AliyunLogger.logging(
                        code="1000",
                        platform=crawler,
                        mode=log_type,
                        env=env,
                        message=f"{video_dict}\n"
                    )
                    if int((int(time.time()) - int(publish_time_stamp)) / (3600 * 24)) > int(rule_dict.get("period", {}).get("max", 1000)):
                        Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
                        AliyunLogger.logging(
                            code="2004",
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            message=f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n'
                        )
                        return
                    if video_dict["video_id"] == '' or video_dict["cover_url"] == '' or video_dict["video_url"] == '':
                        Common.logger(log_type, crawler).info('无效视频\n')
                        AliyunLogger.logging(
                            code="2004",
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            message='无效视频\n'
                        )
                    elif download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
                        Common.logger(log_type, crawler).info("不满足抓取规则\n")
                        AliyunLogger.logging(
                            code="2004",
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            message='不满足抓取规则\n'
                        )
                    elif any(str(word) in video_dict["video_title"]
                             for word in get_config_from_mysql(log_type=log_type,
                                                               source=crawler,
                                                               env=env,
                                                               text="filter",
                                                               action="")):
                        Common.logger(log_type, crawler).info('已中过滤词\n')
                        AliyunLogger.logging(
                            code="2004",
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            message='已中过滤词\n'
                        )
                    elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
                        Common.logger(log_type, crawler).info('视频已下载\n')
                        AliyunLogger.logging(
                            code="2002",
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            message='视频已下载\n'
                        )
                    else:
                        # normalize field names expected by the ETL consumer, then publish to MQ
                        video_dict["out_user_id"] = video_dict["user_id"]
                        video_dict["platform"] = crawler
                        video_dict["strategy"] = log_type
                        video_dict["out_video_id"] = video_dict["video_id"]
                        video_dict["width"] = video_dict["video_width"]
                        video_dict["height"] = video_dict["video_height"]
                        video_dict["crawler_rule"] = json.dumps(rule_dict)
                        video_dict["user_id"] = user_dict["uid"]
                        video_dict["publish_time"] = video_dict["publish_time_str"]
                        video_dict["strategy_type"] = log_type
                        mq.send_msg(video_dict)
                        cls.download_cnt += 1
                except Exception as e:
                    Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
                    AliyunLogger.logging(
                        code="3000",
                        platform=crawler,
                        mode=log_type,
                        env=env,
                        message=f"抓取单条视频异常:{e}\n"
                    )

    @classmethod
    def repeat_video(cls, log_type, crawler, video_id, env):
        # count existing crawler_video records for this id; non-zero means already crawled
        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
        return len(repeat_video)

    @classmethod
    def get_author_videos(cls, log_type, crawler, user_list, rule_dict, env):
        for user_dict in user_list:
            try:
                Common.logger(log_type, crawler).info(f"开始抓取 {user_dict['nick_name']} 主页视频")
                AliyunLogger.logging(
                    code="2000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"开始抓取 {user_dict['nick_name']} 主页视频"
                )
                cls.download_cnt = 0
                cls.get_videoList(log_type=log_type,
                                  crawler=crawler,
                                  user_dict=user_dict,
                                  rule_dict=rule_dict,
                                  env=env)
            except Exception as e:
                Common.logger(log_type, crawler).warning(f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n")
                AliyunLogger.logging(
                    code="3000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n"
                )


if __name__ == "__main__":
    print(KuaishouauthorScheduling.get_cookie("author", "kuaishou", "prod")["cookie"])
    pass