kuaishou_author_scheduling.py

# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/5/24
import os
import shutil
import sys
import time
from hashlib import md5
import requests
import json
import urllib3
from requests.adapters import HTTPAdapter
from common.mq import MQ
sys.path.append(os.getcwd())
from common.common import Common
from common.feishu import Feishu
from common.scheduling_db import MysqlHelper
from common.publish import Publish
from common.public import random_title, get_config_from_mysql, download_rule
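

# Scheduled crawler for Kuaishou author (profile) pages: each user's video feed is
# pulled from the public www.kuaishou.com/graphql endpoint, filtered against the
# rule_dict supplied by the scheduler, and qualifying items are pushed to the
# "topic_crawler_etl_<env>" MQ topic for downstream processing.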
class KuaishouauthorScheduling:
    platform = "快手"
    download_cnt = 0

    @classmethod
    def videos_cnt(cls, rule_dict):
        # Per-user download quota; when no minimum is configured, fall back to a 1000-video cap.
        videos_cnt = rule_dict.get("videos_cnt", {}).get("min", 0)
        if videos_cnt == 0:
            videos_cnt = 1000
        return videos_cnt
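
    # video_title() keeps only the leading segment of the caption (before " #" topic tags
    # and "@" mentions), strips characters that are unsafe in file names, truncates to 40
    # characters, and falls back to random_title() when nothing usable is left.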
    @classmethod
    def video_title(cls, log_type, crawler, env, title):
        title_split1 = title.split(" #")
        if title_split1[0] != "":
            title1 = title_split1[0]
        else:
            title1 = title_split1[-1]

        title_split2 = title1.split(" #")
        if title_split2[0] != "":
            title2 = title_split2[0]
        else:
            title2 = title_split2[-1]

        title_split3 = title2.split("@")
        if title_split3[0] != "":
            title3 = title_split3[0]
        else:
            title3 = title_split3[-1]

        video_title = title3.strip().replace("\n", "") \
            .replace("/", "").replace("快手", "").replace(" ", "") \
            .replace(" ", "").replace("&NBSP", "").replace("\r", "") \
            .replace("#", "").replace(".", "。").replace("\\", "") \
            .replace(":", "").replace("*", "").replace("?", "") \
            .replace("?", "").replace('"', "").replace("<", "") \
            .replace(">", "").replace("|", "").replace("@", "").replace('"', '').replace("'", '')[:40]
        if video_title.replace(" ", "") == "" or video_title == "。。。" or video_title == "...":
            return random_title(log_type, crawler, env, text='title')
        else:
            return video_title

    @classmethod
    def get_cookie(cls, log_type, crawler, env):
        select_sql = f""" select * from crawler_config where source="{crawler}" """
        configs = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
        for config in configs:
            if "cookie" in config["config"]:
                # The config column stores a dict literal; eval() extracts the cookie string from it.
                cookie_dict = {
                    "cookie_id": config["id"],
                    "title": config["title"].strip(),
                    "cookie": dict(eval(config["config"]))["cookie"].strip(),
                    "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(config["update_time"] / 1000))),
                    "operator": config["operator"].strip()
                }
                # for k, v in cookie_dict.items():
                #     print(f"{k}:{type(v)}, {v}")
                # Return the first config row that carries a cookie.
                return cookie_dict
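
    # get_videoList() pages through visionProfilePhotoList on the web GraphQL endpoint:
    # the pcursor returned by each response drives the next request, and the loop stops
    # when the account quota is reached, an error response comes back, or "feeds" is empty.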
    @classmethod
    def get_videoList(cls, log_type, crawler, user_dict, rule_dict, env):
        pcursor = ""
        mq = MQ(topic_name="topic_crawler_etl_" + env)
        while True:
            url = "https://www.kuaishou.com/graphql"
            payload = json.dumps({
                "operationName": "visionProfilePhotoList",
                "variables": {
                    "userId": user_dict["link"].replace("https://www.kuaishou.com/profile/", ""),
                    "pcursor": pcursor,
                    "page": "profile"
                },
                "query": "fragment photoContent on PhotoEntity {\n id\n duration\n caption\n originCaption\n likeCount\n viewCount\n commentCount\n realLikeCount\n coverUrl\n photoUrl\n photoH265Url\n manifest\n manifestH265\n videoResource\n coverUrls {\n url\n __typename\n }\n timestamp\n expTag\n animatedCoverUrl\n distance\n videoRatio\n liked\n stereoType\n profileUserTopPhoto\n musicBlocked\n __typename\n}\n\nfragment feedContent on Feed {\n type\n author {\n id\n name\n headerUrl\n following\n headerUrls {\n url\n __typename\n }\n __typename\n }\n photo {\n ...photoContent\n __typename\n }\n canAddComment\n llsid\n status\n currentPcursor\n tags {\n type\n name\n __typename\n }\n __typename\n}\n\nquery visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n ...feedContent\n __typename\n }\n hostName\n pcursor\n __typename\n }\n}\n"
            })
            headers = {
                'Accept': '*/*',
                'Content-Type': 'application/json',
                'Origin': 'https://www.kuaishou.com',
                'Cookie': cls.get_cookie(log_type, crawler, env)["cookie"],
                # Hard-coded value; requests would normally derive Content-Length from the payload.
                'Content-Length': '1260',
                'Accept-Language': 'zh-CN,zh-Hans;q=0.9',
                'Host': 'www.kuaishou.com',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15',
                'Referer': f'https://www.kuaishou.com/profile/{user_dict["link"].replace("https://www.kuaishou.com/profile/", "")}',
                'Accept-Encoding': 'gzip, deflate, br',
                'Connection': 'keep-alive'
            }
            urllib3.disable_warnings()
            s = requests.session()
            # max_retries=3: retry each request up to 3 times
            s.mount('http://', HTTPAdapter(max_retries=3))
            s.mount('https://', HTTPAdapter(max_retries=3))
            response = s.post(url=url, headers=headers, data=payload, proxies=Common.tunnel_proxies(), verify=False, timeout=10)
            response.close()
            # Common.logger(log_type, crawler).info(f"response:{response.text}\n")
            if response.status_code != 200:
                Common.logger(log_type, crawler).warning(f"response:{response.text}\n")
                Common.logging(log_type, crawler, env, f"response:{response.text}\n")
                return
            elif "data" not in response.json():
                Common.logger(log_type, crawler).warning(f"response:{response.json()}\n")
                Common.logging(log_type, crawler, env, f"response:{response.json()}\n")
                return
            elif "visionProfilePhotoList" not in response.json()["data"]:
                Common.logger(log_type, crawler).warning(f"response:{response.json()}\n")
                Common.logging(log_type, crawler, env, f"response:{response.json()}\n")
                return
            elif "feeds" not in response.json()["data"]["visionProfilePhotoList"]:
                Common.logger(log_type, crawler).warning(f"response:{response.json()}\n")
                Common.logging(log_type, crawler, env, f"response:{response.json()}\n")
                return
            elif len(response.json()["data"]["visionProfilePhotoList"]["feeds"]) == 0:
                Common.logger(log_type, crawler).warning(f"没有更多视频啦 ~\n")
                Common.logging(log_type, crawler, env, f"没有更多视频啦 ~\n")
                return
            pcursor = response.json()['data']['visionProfilePhotoList']['pcursor']
            feeds = response.json()['data']['visionProfilePhotoList']['feeds']
            for i in range(len(feeds)):
                try:
                    if cls.download_cnt >= cls.videos_cnt(rule_dict):
                        # if cls.download_cnt >= 2:
                        Common.logger(log_type, crawler).info(f"已下载视频数:{cls.download_cnt}\n")
                        Common.logging(log_type, crawler, env, f"已下载视频数:{cls.download_cnt}\n")
                        return
                    video_title = feeds[i].get("photo", {}).get("caption", random_title(log_type, crawler, env, text='title'))
                    video_title = cls.video_title(log_type, crawler, env, video_title)
                    # Prefer the h264 rendition; fall back to hevc when those keys are missing.
                    try:
                        video_id = feeds[i].get("photo", {}).get("videoResource").get("h264", {}).get("videoId", "")
                        video_width = feeds[i].get("photo", {}).get("videoResource").get("h264", {}).get("adaptationSet", {})[0].get("representation", {})[0].get("width", 0)
                        video_height = feeds[i].get("photo", {}).get("videoResource").get("h264", {}).get("adaptationSet", {})[0].get("representation", {})[0].get("height", 0)
                    except KeyError:
                        video_id = feeds[i].get("photo", {}).get("videoResource").get("hevc", {}).get("videoId", "")
                        video_width = feeds[i].get("photo", {}).get("videoResource").get("hevc", {}).get("adaptationSet", {})[0].get("representation", {})[0].get("width", 0)
                        video_height = feeds[i].get("photo", {}).get("videoResource").get("hevc", {}).get("adaptationSet", {})[0].get("representation", {})[0].get("height", 0)
                    publish_time_stamp = int(int(feeds[i].get('photo', {}).get('timestamp', 0)) / 1000)
                    publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
                    video_dict = {'video_title': video_title,
                                  'video_id': video_id,
                                  'play_cnt': int(feeds[i].get('photo', {}).get('viewCount', 0)),
                                  'like_cnt': int(feeds[i].get('photo', {}).get('realLikeCount', 0)),
                                  'comment_cnt': 0,
                                  'share_cnt': 0,
                                  'video_width': video_width,
                                  'video_height': video_height,
                                  'duration': int(int(feeds[i].get('photo', {}).get('duration', 0)) / 1000),
                                  'publish_time_stamp': publish_time_stamp,
                                  'publish_time_str': publish_time_str,
                                  'user_name': feeds[i].get('author', {}).get('name', ""),
                                  'user_id': feeds[i].get('author', {}).get('id', ""),
                                  'avatar_url': feeds[i].get('author', {}).get('headerUrl', ""),
                                  'cover_url': feeds[i].get('photo', {}).get('coverUrl', ""),
                                  'video_url': feeds[i].get('photo', {}).get('photoUrl', ""),
                                  'session': f"kuaishou-{int(time.time())}"}
                    for k, v in video_dict.items():
                        Common.logger(log_type, crawler).info(f"{k}:{v}")
                    Common.logging(log_type, crawler, env, f"{video_dict}")
                    # Stop paging once posts are older than the configured period window.
                    if int((int(time.time()) - int(publish_time_stamp)) / (3600 * 24)) > int(rule_dict.get("period", {}).get("max", 1000)):
                        Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
                        Common.logging(log_type, crawler, env, f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
                        return
                    if video_dict["video_id"] == '' or video_dict["cover_url"] == '' or video_dict["video_url"] == '':
                        Common.logger(log_type, crawler).info('无效视频\n')
                        Common.logging(log_type, crawler, env, '无效视频\n')
                    elif download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
                        Common.logger(log_type, crawler).info("不满足抓取规则\n")
                        Common.logging(log_type, crawler, env, "不满足抓取规则\n")
                    elif any(str(word) if str(word) in video_dict["video_title"] else False
                             for word in get_config_from_mysql(log_type=log_type,
                                                               source=crawler,
                                                               env=env,
                                                               text="filter",
                                                               action="")) is True:
                        Common.logger(log_type, crawler).info('已中过滤词\n')
                        Common.logging(log_type, crawler, env, '已中过滤词\n')
                    elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
                        Common.logger(log_type, crawler).info('视频已下载\n')
                        Common.logging(log_type, crawler, env, '视频已下载\n')
                    else:
                        # All checks passed: reshape the dict into the ETL message format and push to MQ.
                        video_dict["out_user_id"] = video_dict["user_id"]
                        video_dict["platform"] = crawler
                        video_dict["strategy"] = log_type
                        video_dict["out_video_id"] = video_dict["video_id"]
                        video_dict["width"] = video_dict["video_width"]
                        video_dict["height"] = video_dict["video_height"]
                        video_dict["crawler_rule"] = json.dumps(rule_dict)
                        video_dict["user_id"] = user_dict["uid"]
                        video_dict["publish_time"] = video_dict["publish_time_str"]
                        video_dict["strategy_type"] = log_type
                        mq.send_msg(video_dict)
                        cls.download_cnt += 1
                        # cls.download_publish(log_type=log_type,
                        #                      crawler=crawler,
                        #                      user_dict=user_dict,
                        #                      video_dict=video_dict,
                        #                      rule_dict=rule_dict,
                        #                      env=env)
                except Exception as e:
                    Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
                    Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")

    @classmethod
    def repeat_video(cls, log_type, crawler, video_id, env):
        # sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}" """
        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
        return len(repeat_video)
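
    # download_publish() is no longer invoked from get_videoList() (its call there is
    # commented out in favour of the MQ path): it downloads the video and cover, saves
    # the info txt, uploads through Publish, records the result in crawler_video, and
    # appends a row to the Feishu sheet.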
    @classmethod
    def download_publish(cls, log_type, crawler, user_dict, rule_dict, video_dict, env):
        # Download the video
        Common.download_method(log_type=log_type, crawler=crawler, text='video', title=video_dict['video_title'], url=video_dict['video_url'])
        md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
        try:
            if os.path.getsize(f"./{crawler}/videos/{md_title}/video.mp4") == 0:
                # Remove the video folder
                shutil.rmtree(f"./{crawler}/videos/{md_title}")
                Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
                Common.logging(log_type, crawler, env, "视频size=0,删除成功\n")
                return
        except FileNotFoundError:
            # Remove the video folder
            shutil.rmtree(f"./{crawler}/videos/{md_title}")
            Common.logger(log_type, crawler).info("视频文件不存在,删除文件夹成功\n")
            Common.logging(log_type, crawler, env, "视频文件不存在,删除文件夹成功\n")
            return
        # Download the cover
        Common.download_method(log_type=log_type, crawler=crawler, text='cover', title=video_dict['video_title'], url=video_dict['cover_url'])
        # Save the video info to txt
        Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)
        # Upload the video
        Common.logger(log_type, crawler).info("开始上传视频...")
        Common.logging(log_type, crawler, env, "开始上传视频...")
        if env == "dev":
            oss_endpoint = "out"
            our_video_id = Publish.upload_and_publish(log_type=log_type,
                                                      crawler=crawler,
                                                      strategy="定向抓取策略",
                                                      our_uid=user_dict["uid"],
                                                      env=env,
                                                      oss_endpoint=oss_endpoint)
            our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
        else:
            oss_endpoint = "inner"
            our_video_id = Publish.upload_and_publish(log_type=log_type,
                                                      crawler=crawler,
                                                      strategy="定向抓取策略",
                                                      our_uid=user_dict["uid"],
                                                      env=env,
                                                      oss_endpoint=oss_endpoint)
            our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
        if our_video_id is None:
            try:
                # Upload failed: remove the video folder
                shutil.rmtree(f"./{crawler}/videos/{md_title}")
                return
            except FileNotFoundError:
                return
        # Save the video info to the database
        insert_sql = f""" insert into crawler_video(video_id,
                                                    user_id,
                                                    out_user_id,
                                                    platform,
                                                    strategy,
                                                    out_video_id,
                                                    video_title,
                                                    cover_url,
                                                    video_url,
                                                    duration,
                                                    publish_time,
                                                    play_cnt,
                                                    crawler_rule,
                                                    width,
                                                    height)
                                                    values({our_video_id},
                                                    {user_dict["uid"]},
                                                    "{video_dict['user_id']}",
                                                    "{cls.platform}",
                                                    "定向爬虫策略",
                                                    "{video_dict['video_id']}",
                                                    "{video_dict['video_title']}",
                                                    "{video_dict['cover_url']}",
                                                    "{video_dict['video_url']}",
                                                    {int(video_dict['duration'])},
                                                    "{video_dict['publish_time_str']}",
                                                    {int(video_dict['play_cnt'])},
                                                    '{json.dumps(rule_dict)}',
                                                    {int(video_dict['video_width'])},
                                                    {int(video_dict['video_height'])}) """
        Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
        Common.logging(log_type, crawler, env, f"insert_sql:{insert_sql}")
        MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
        Common.logger(log_type, crawler).info('视频信息写入数据库成功')
        Common.logging(log_type, crawler, env, '视频信息写入数据库成功')
        # Write the video to the Feishu sheet
        Feishu.insert_columns(log_type, crawler, "fYdA8F", "ROWS", 1, 2)
        upload_time = int(time.time())
        values = [[our_video_id,
                   time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
                   "定向榜",
                   str(video_dict['video_id']),
                   video_dict['video_title'],
                   our_video_link,
                   video_dict['play_cnt'],
                   video_dict['comment_cnt'],
                   video_dict['like_cnt'],
                   video_dict['share_cnt'],
                   video_dict['duration'],
                   f"{video_dict['video_width']}*{video_dict['video_height']}",
                   video_dict['publish_time_str'],
                   video_dict['user_name'],
                   video_dict['user_id'],
                   video_dict['avatar_url'],
                   video_dict['cover_url'],
                   video_dict['video_url']]]
        time.sleep(1)
        Feishu.update_values(log_type, crawler, "fYdA8F", "E2:Z2", values)
        Common.logger(log_type, crawler).info(f"视频已保存至云文档\n")
        Common.logging(log_type, crawler, env, f"视频已保存至云文档\n")
        cls.download_cnt += 1
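
    # get_author_videos() is the entry point driven by the scheduler: it resets the
    # per-account counter and crawls each user's profile in turn, logging and skipping
    # any account that raises.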
    @classmethod
    def get_author_videos(cls, log_type, crawler, user_list, rule_dict, env):
        for user_dict in user_list:
            try:
                Common.logger(log_type, crawler).info(f"开始抓取 {user_dict['nick_name']} 主页视频")
                Common.logging(log_type, crawler, env, f"开始抓取 {user_dict['nick_name']} 主页视频")
                cls.download_cnt = 0
                cls.get_videoList(log_type=log_type,
                                  crawler=crawler,
                                  user_dict=user_dict,
                                  rule_dict=rule_dict,
                                  env=env)
            except Exception as e:
                Common.logger(log_type, crawler).warning(f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n")
                Common.logging(log_type, crawler, env, f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n")


if __name__ == "__main__":
    print(KuaishouauthorScheduling.get_cookie("author", "kuaishou", "prod")["cookie"])
    pass
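
# A minimal sketch of how the scheduler entry point can be driven. The user_list /
# rule_dict shapes below are assumptions inferred from the keys this module reads
# (uid, nick_name, link; videos_cnt.min, period.max), not a confirmed caller:
#
# KuaishouauthorScheduling.get_author_videos(
#     log_type="author",
#     crawler="kuaishou",
#     user_list=[{
#         "uid": 123456,                                         # our_uid the video is published under
#         "nick_name": "示例账号",                                 # used only for logging
#         "link": "https://www.kuaishou.com/profile/3xexample",  # the id after /profile/ becomes the GraphQL userId
#     }],
#     rule_dict={"videos_cnt": {"min": 10}, "period": {"max": 30}},
#     env="prod",
# )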