douyin_author_scheduling_new.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. # -*- coding: utf-8 -*-
  2. # @Time: 2023/11/07
  3. import os
  4. import random
  5. import sys
  6. import time
  7. import requests
  8. import json
  9. import urllib3
  10. sys.path.append(os.getcwd())
  11. from datetime import timedelta, date
  12. from common.common import Common
  13. from common import AliyunLogger
  14. from common.mq import MQ
  15. from requests.adapters import HTTPAdapter
  16. from common.scheduling_db import MysqlHelper
  17. from common.public import get_config_from_mysql, download_rule
  18. from douyin.douyin_author.douyin_author_scheduling_help import DouYinHelper
  19. from common.limit import AuthorLimit
class DouyinauthorScheduling:
    """Scheduled crawler for Douyin (抖音) author home-page videos."""
    # Human-readable platform name; also matched against crawler_video.platform
    # in repeat_video's dedup query.
    platform = "抖音"
    # Class-level counter of videos pushed to MQ; reset to 0 per user in
    # get_author_videos before each crawl.
    download_cnt = 0
    # Per-author rate limiter; gates mq.send_msg in get_videoList.
    limiter = AuthorLimit(platform="douyin", mode="author")
  24. @classmethod
  25. def videos_cnt(cls, rule_dict):
  26. videos_cnt = rule_dict.get("videos_cnt", {}).get("min", 0)
  27. if videos_cnt == 0:
  28. videos_cnt = 1000
  29. return videos_cnt
  30. @classmethod
  31. def get_cookie(cls, log_type, crawler, env):
  32. select_sql = f""" select * from crawler_config where source="{crawler}" """
  33. configs = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
  34. for config in configs:
  35. if "cookie" in config["config"]:
  36. cookie_dict = {
  37. "cookie_id": config["id"],
  38. "title": config["title"].strip(),
  39. "cookie": dict(eval(config["config"]))["cookie"].strip(),
  40. "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(config["update_time"] / 1000))),
  41. "operator": config["operator"].strip()
  42. }
  43. return cookie_dict
  44. @classmethod
  45. def get_videoList(cls, log_type, crawler, user_dict, rule_dict, env):
  46. mq = MQ(topic_name="topic_crawler_etl_" + env)
  47. next_cursor = 0
  48. special = 0
  49. for i in range(3):
  50. # while True:
  51. flag = user_dict["link"].split("_")[0]
  52. if flag == "V1":
  53. special = 0.01
  54. rule_dict = {
  55. "like_cnt": {"min": 10000, "max": 0},
  56. 'period': {"min": 90, "max": 90},
  57. }
  58. elif flag == "V2":
  59. special = 0.01
  60. rule_dict = {
  61. "like_cnt": {"min": 2000, "max": 0},
  62. 'period': {"min": 90, "max": 90},
  63. }
  64. elif flag == "V3":
  65. special = 0.01
  66. rule_dict = {
  67. "like_cnt": {"min": 100, "max": 0},
  68. 'period': {"min": 90, "max": 90},
  69. }
  70. cookie = cls.get_cookie(log_type, crawler, env)["cookie"]
  71. if user_dict['link'][0] == "V":
  72. link = user_dict["link"][3:]
  73. else:
  74. link = user_dict["link"]
  75. time.sleep(random.randint(5, 10))
  76. url = 'https://www.douyin.com/aweme/v1/web/aweme/post/'
  77. account_id = link
  78. headers = {
  79. 'Accept': 'application/json, text/plain, */*',
  80. 'Accept-Language': 'zh-CN,zh;q=0.9',
  81. 'Cache-Control': 'no-cache',
  82. 'Cookie': cookie,
  83. # 'Cookie': "ttwid=1%7Cyj16cpJ4yxvUv9QWry1Uz3MoZ3Ci7FHGQR4qW3W70ac%7C1704436922%7C435637f1aa3c55cbed4587acf02003b5d74cfcac945a0df3893e041a288ce3c1; bd_ticket_guard_client_web_domain=2; passport_assist_user=CjzcKtls0e65w_tjpggJoAB9du8ZDR8XRxt178-cIHsJhxCRZPLxqAj0PHWKZ4g2xmxWzTHsK7mi4vxt0lcaSgo8W1SZlyQoj2vcxlToyotQ902cRuWULW6HqkHEJHMRcIoo_Y7maHi82HqNSTCVE5xBSQnTOXW31hxsJ4EIENPsxQ0Yia_WVCABIgED619Mew%3D%3D; n_mh=uPso8EqWH8OYYER0xnVFOgB1e9TbTzK9J1CBmr4IQVA; sso_uid_tt=f829ccc6652eae601ff8e56da1fccdb5; sso_uid_tt_ss=f829ccc6652eae601ff8e56da1fccdb5; toutiao_sso_user=d2fa09f7626319fb35fd2553b5ec5b76; toutiao_sso_user_ss=d2fa09f7626319fb35fd2553b5ec5b76; LOGIN_STATUS=1; store-region=cn-hn; store-region-src=uid; d_ticket=dd5890b4b8f873453c1f1a090b9aa6ccb205c; sid_ucp_v1=1.0.0-KGU1NTNlNmFjMGJmZTEwNjFhYWZjZTMyMGEzYmI4YmVjOTdjYzU0N2YKGQjPnMP64wEQiLzirQYY7zEgDDgGQPQHSAQaAmxxIiA1M2NmNzM1ZjUyMzA1ZTkxZDMyZTEyMmVhM2ZhYTQ1YQ; ssid_ucp_v1=1.0.0-KGU1NTNlNmFjMGJmZTEwNjFhYWZjZTMyMGEzYmI4YmVjOTdjYzU0N2YKGQjPnMP64wEQiLzirQYY7zEgDDgGQPQHSAQaAmxxIiA1M2NmNzM1ZjUyMzA1ZTkxZDMyZTEyMmVhM2ZhYTQ1YQ; dy_swidth=1449; dy_sheight=906; __live_version__=%221.1.1.8009%22; live_use_vvc=%22false%22; xgplayer_user_id=510446933624; uid_tt=f829ccc6652eae601ff8e56da1fccdb5; uid_tt_ss=f829ccc6652eae601ff8e56da1fccdb5; sid_tt=d2fa09f7626319fb35fd2553b5ec5b76; sessionid=d2fa09f7626319fb35fd2553b5ec5b76; sessionid_ss=d2fa09f7626319fb35fd2553b5ec5b76; passport_csrf_token=34235b71f9c981e07032bd9041848f1e; passport_csrf_token_default=34235b71f9c981e07032bd9041848f1e; download_guide=%223%2F20240313%2F1%22; publish_badge_show_info=%220%2C0%2C0%2C1710488198823%22; EnhanceDownloadGuide=%220_0_0_0_2_1710734032%22; sid_ucp_sso_v1=1.0.0-KGFiZDU4YWZmOTcwOWI1ZGIzOWUxNTBjZTc2Y2IxNmY4MjA3NDU1ZjEKHQjPnMP64wEQ2b7lrwYY7zEgDDDzrarJBTgGQPQHGgJobCIgZDJmYTA5Zjc2MjYzMTlmYjM1ZmQyNTUzYjVlYzViNzY; 
ssid_ucp_sso_v1=1.0.0-KGFiZDU4YWZmOTcwOWI1ZGIzOWUxNTBjZTc2Y2IxNmY4MjA3NDU1ZjEKHQjPnMP64wEQ2b7lrwYY7zEgDDDzrarJBTgGQPQHGgJobCIgZDJmYTA5Zjc2MjYzMTlmYjM1ZmQyNTUzYjVlYzViNzY; sid_guard=d2fa09f7626319fb35fd2553b5ec5b76%7C1710841689%7C5184001%7CSat%2C+18-May-2024+09%3A48%3A10+GMT; __ac_nonce=065fb9e6800dd2b1e14b1; __ac_signature=_02B4Z6wo00f01l39XCgAAIDBYFRGtw.YiKZd3ViAAPKTmnfg2zaxyzrXD6iNtRPPtcoSm5zbE6snYTcix8FTXgxsxQK195O6vG-zEOdZqKTq-ouYFPANlN1Jmu1.ZxBLTzOstKAOorrHEYQN06; douyin.com; xg_device_score=7.654580937785368; device_web_cpu_core=16; device_web_memory_size=8; architecture=amd64; csrf_session_id=f524ab33e8de0e4e922d8b48c362e6c1; strategyABtestKey=%221710988910.691%22; volume_info=%7B%22isUserMute%22%3Afalse%2C%22isMute%22%3Afalse%2C%22volume%22%3A0.281%7D; stream_player_status_params=%22%7B%5C%22is_auto_play%5C%22%3A0%2C%5C%22is_full_screen%5C%22%3A0%2C%5C%22is_full_webscreen%5C%22%3A1%2C%5C%22is_mute%5C%22%3A0%2C%5C%22is_speed%5C%22%3A1%2C%5C%22is_visible%5C%22%3A1%7D%22; my_rd=2; FOLLOW_NUMBER_YELLOW_POINT_INFO=%22MS4wLjABAAAABDh3DP0PxVP5lMvEKL8Fhg8sSD732Z2rOc0db7nqY9o%2F1711036800000%2F0%2F1710989305107%2F0%22; SEARCH_RESULT_LIST_TYPE=%22single%22; stream_recommend_feed_params=%22%7B%5C%22cookie_enabled%5C%22%3Atrue%2C%5C%22screen_width%5C%22%3A1449%2C%5C%22screen_height%5C%22%3A906%2C%5C%22browser_online%5C%22%3Atrue%2C%5C%22cpu_core_num%5C%22%3A16%2C%5C%22device_memory%5C%22%3A8%2C%5C%22downlink%5C%22%3A10%2C%5C%22effective_type%5C%22%3A%5C%224g%5C%22%2C%5C%22round_trip_time%5C%22%3A50%7D%22; bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtcmVlLXB1YmxpYy1rZXkiOiJCT05aUWpZcmNjMWhVYXhidlVoUG9uUC9lV0phYzBNbnhTQldxUmZESGFZQ290cUhOSE1GdmJ2ZTdSY1REdVpiemdHUU82cS90dWhzNVdnTmxaeVR3TzQ9IiwiYmQtdGlja2V0LWd1YXJkLXdlYi12ZXJzaW9uIjoxfQ%3D%3D; tt_scid=IMdpIr4sRF90L0IaD0TdSlOy0Sm1rX-hlw5-OAxNAcisxsztezRzg3356KIGHx4cee78; pwa2=%220%7C0%7C1%7C0%22; 
odin_tt=f05b7460c2544b994f5deae19a5bbf0828870c64564040ef36c9d7cb40da9e44bc41ee52b1cac76d042b80fc4dcb4394; msToken=Tq7-Wv99mG0yhHDIz7-R1fxSAQyf8R7dNAvHMxnjrbWpbi531L8TI6VdQhQSDTAl8jQQJr9IWhJpbRu3E01IgC5uQ7DE_5oGYW046WpPb_bjluz255YhMdqfJ3Qmeg==; FOLLOW_LIVE_POINT_INFO=%22MS4wLjABAAAABDh3DP0PxVP5lMvEKL8Fhg8sSD732Z2rOc0db7nqY9o%2F1711036800000%2F0%2F1710990128301%2F0%22; msToken=YstZKHMONS09-8nDsHM40jwWV2nr5E1wYmv7cBeAmeY02prkpNLjRwB8C3tp52nc1hxvL5R1F-hkmvDSc0TNeNxz-DNodK3GMV8dK3gkVT8DVPKeVL5umskY5Am5; passport_fe_beating_status=false; IsDouyinActive=true; home_can_add_dy_2_desktop=%220%22",
  84. 'Pragma': 'no-cache',
  85. 'Referer': f'https://www.douyin.com/user/{account_id}',
  86. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
  87. 'Chrome/118.0.0.0 Safari/537.36',
  88. }
  89. query = DouYinHelper.get_full_query(ua=headers['User-Agent'], extra_data={
  90. 'sec_user_id': account_id,
  91. 'max_cursor': next_cursor,
  92. 'locate_query': 'false',
  93. 'show_live_replay_strategy': '1',
  94. 'need_time_list': '1',
  95. 'time_list_query': '0',
  96. 'whale_cut_token': '',
  97. 'cut_version': '1',
  98. 'count': '18',
  99. 'publish_video_strategy_type': '2',
  100. })
  101. urllib3.disable_warnings()
  102. s = requests.session()
  103. # max_retries=3 重试3次
  104. s.mount('http://', HTTPAdapter(max_retries=3))
  105. s.mount('https://', HTTPAdapter(max_retries=3))
  106. response = requests.request(method='GET', url=url, headers=headers, params=query)
  107. body = response.content.decode()
  108. obj = json.loads(body)
  109. has_more = True if obj.get('has_more', 0) == 1 else False
  110. next_cursor = str(obj.get('max_cursor')) if has_more else None
  111. data = obj.get('aweme_list', [])
  112. response.close()
  113. if response.status_code != 200:
  114. Common.logger(log_type, crawler).warning(f"data:{data}\n")
  115. AliyunLogger.logging(
  116. code="2000",
  117. platform=crawler,
  118. mode=log_type,
  119. env=env,
  120. message=f"data:{data}\n"
  121. )
  122. return
  123. elif len(data) == 0:
  124. Common.logger(log_type, crawler).warning(f"没有更多视频啦 ~\n")
  125. AliyunLogger.logging(
  126. code="2001",
  127. platform=crawler,
  128. mode=log_type,
  129. env=env,
  130. message=f"没有更多视频啦 ~\n"
  131. )
  132. return
  133. for i in range(len(data)):
  134. try:
  135. entity_type = data[i].get('search_impr').get('entity_type')
  136. if entity_type == 'GENERAL':
  137. Common.logger(log_type, crawler).info('扫描到一条视频\n')
  138. AliyunLogger.logging(
  139. code="1001",
  140. platform=crawler,
  141. mode=log_type,
  142. env=env,
  143. message='扫描到一条视频\n'
  144. )
  145. is_top = data[i].get('is_top') # 是否置顶
  146. video_id = data[i].get('aweme_id') # 文章id
  147. video_title = data[i].get('desc', "").strip().replace("\n", "") \
  148. .replace("/", "").replace("\\", "").replace("\r", "") \
  149. .replace(":", "").replace("*", "").replace("?", "") \
  150. .replace("?", "").replace('"', "").replace("<", "") \
  151. .replace(">", "").replace("|", "").replace(" ", "") \
  152. .replace("&NBSP", "").replace(".", "。").replace(" ", "") \
  153. .replace("'", "").replace("#", "").replace("Merge", "")
  154. publish_time_stamp = data[i].get('create_time') # 发布时间
  155. publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
  156. video_url = data[i].get('video').get('play_addr').get('url_list')[0] # 视频链接
  157. cover_url = data[i].get('video').get('cover').get('url_list')[0] # 视频封面
  158. digg_count = int(data[i].get('statistics').get('digg_count')) # 点赞
  159. comment_count = int(data[i].get('statistics').get('comment_count')) # 评论
  160. # collect_count = data[i].get('statistics').get('collect_count') # 收藏
  161. share_count = int(data[i].get('statistics').get('share_count')) # 转发
  162. if special != 0:
  163. if share_count != 0:
  164. video_percent = '%.2f' % (share_count / digg_count)
  165. special = float(special)
  166. if float(video_percent) < special:
  167. Common.logger(log_type, crawler).info(f"不符合条件:分享/点赞-{video_percent}\n")
  168. AliyunLogger.logging(
  169. code="2004",
  170. platform=crawler,
  171. mode=log_type,
  172. env=env,
  173. message=f"不符合条件:分享/点赞-{video_percent},点赞量-{digg_count}\n"
  174. )
  175. continue
  176. video_dict = {'video_title': video_title,
  177. 'video_id': video_id,
  178. 'play_cnt': 0,
  179. 'like_cnt': digg_count,
  180. 'comment_cnt': comment_count,
  181. 'share_cnt': share_count,
  182. 'video_width': 0,
  183. 'video_height': 0,
  184. 'duration': 0,
  185. 'publish_time_stamp': publish_time_stamp,
  186. 'publish_time_str': publish_time_str,
  187. 'user_name': "douyin",
  188. 'user_id': video_id,
  189. 'avatar_url': '',
  190. 'cover_url': cover_url,
  191. 'video_url': video_url,
  192. 'session': f"douyin-{int(time.time())}"}
  193. for k, v in video_dict.items():
  194. Common.logger(log_type, crawler).info(f"{k}:{v}")
  195. AliyunLogger.logging(
  196. code="1000",
  197. platform=crawler,
  198. mode=log_type,
  199. env=env,
  200. message=f"{video_dict}\n"
  201. )
  202. if is_top == 0:
  203. if int((int(time.time()) - int(publish_time_stamp)) / (3600*24)) > int(rule_dict.get("period", {}).get("max", 1000)):
  204. Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
  205. AliyunLogger.logging(
  206. code="2004",
  207. platform=crawler,
  208. mode=log_type,
  209. env=env,
  210. message=f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n'
  211. )
  212. return
  213. if video_dict["video_id"] == '' or video_dict["cover_url"] == '' or video_dict["video_url"] == '':
  214. Common.logger(log_type, crawler).info('无效视频\n')
  215. AliyunLogger.logging(
  216. code="2004",
  217. platform=crawler,
  218. mode=log_type,
  219. env=env,
  220. message='无效视频\n'
  221. )
  222. elif download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
  223. Common.logger(log_type, crawler).info("不满足抓取规则\n")
  224. AliyunLogger.logging(
  225. code="2004",
  226. platform=crawler,
  227. mode=log_type,
  228. env=env,
  229. message='不满足抓取规则\n'
  230. )
  231. elif any(str(word) if str(word) in video_dict["video_title"] else False
  232. for word in get_config_from_mysql(log_type=log_type,
  233. source=crawler,
  234. env=env,
  235. text="filter",
  236. action="")) is True:
  237. Common.logger(log_type, crawler).info('已中过滤词\n')
  238. AliyunLogger.logging(
  239. code="2004",
  240. platform=crawler,
  241. mode=log_type,
  242. env=env,
  243. message='已中过滤词\n'
  244. )
  245. elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
  246. Common.logger(log_type, crawler).info('视频已下载\n')
  247. AliyunLogger.logging(
  248. code="2002",
  249. platform=crawler,
  250. mode=log_type,
  251. env=env,
  252. message='视频已下载\n'
  253. )
  254. else:
  255. video_dict["out_user_id"] = video_dict["user_id"]
  256. video_dict["platform"] = crawler
  257. video_dict["strategy"] = log_type
  258. video_dict["out_video_id"] = video_dict["video_id"]
  259. video_dict["width"] = video_dict["video_width"]
  260. video_dict["height"] = video_dict["video_height"]
  261. video_dict["crawler_rule"] = json.dumps(rule_dict)
  262. video_dict["user_id"] = user_dict["uid"]
  263. video_dict["publish_time"] = video_dict["publish_time_str"]
  264. video_dict["strategy_type"] = log_type
  265. limit_flag = cls.limiter.author_limitation(user_id=video_dict['user_id'])
  266. if limit_flag:
  267. mq.send_msg(video_dict)
  268. cls.download_cnt += 1
  269. except Exception as e:
  270. Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
  271. AliyunLogger.logging(
  272. code="3000",
  273. platform=crawler,
  274. mode=log_type,
  275. env=env,
  276. message=f"抓取单条视频异常:{e}\n"
  277. )
  278. @classmethod
  279. def repeat_video(cls, log_type, crawler, video_id, env):
  280. sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
  281. repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
  282. return len(repeat_video)
  283. @classmethod
  284. def get_author_videos(cls, log_type, crawler, user_list, rule_dict, env):
  285. for user_dict in user_list:
  286. try:
  287. Common.logger(log_type, crawler).info(f"开始抓取 {user_dict['nick_name']} 主页视频")
  288. AliyunLogger.logging(
  289. code="2000",
  290. platform=crawler,
  291. mode=log_type,
  292. env=env,
  293. message=f"开始抓取 {user_dict['nick_name']} 主页视频"
  294. )
  295. cls.download_cnt = 0
  296. cls.get_videoList(log_type=log_type,
  297. crawler=crawler,
  298. user_dict=user_dict,
  299. rule_dict=rule_dict,
  300. env=env)
  301. except Exception as e:
  302. Common.logger(log_type, crawler).warning(f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n")
  303. AliyunLogger.logging(
  304. code="3000",
  305. platform=crawler,
  306. mode=log_type,
  307. env=env,
  308. message=f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n"
  309. )
  310. if __name__ == "__main__":
  311. print(DouyinauthorScheduling.get_cookie("author", "douyin", "prod")["cookie"])
  312. pass