# douyin_author_scheduling_new.py
# -*- coding: utf-8 -*-
# @Time: 2023/11/07
import ast
import json
import os
import random
import sys
import time
from datetime import timedelta, date

import requests
import urllib3
from requests.adapters import HTTPAdapter

sys.path.append(os.getcwd())
from common.common import Common
from common import AliyunLogger
from common.mq import MQ
from common.scheduling_db import MysqlHelper
from common.public import get_config_from_mysql, download_rule
from common.limit import AuthorLimit
from douyin.douyin_author.douyin_author_scheduling_help import DouYinHelper
  20. class DouyinauthorScheduling:
  21. platform = "抖音"
  22. download_cnt = 0
  23. limiter = AuthorLimit(platform="douyin", mode="author")
  24. @classmethod
  25. def videos_cnt(cls, rule_dict):
  26. videos_cnt = rule_dict.get("videos_cnt", {}).get("min", 0)
  27. if videos_cnt == 0:
  28. videos_cnt = 1000
  29. return videos_cnt
  30. @classmethod
  31. def get_cookie(cls, log_type, crawler, env):
  32. select_sql = f""" select * from crawler_config where source="{crawler}" """
  33. configs = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
  34. for config in configs:
  35. if "cookie" in config["config"]:
  36. cookie_dict = {
  37. "cookie_id": config["id"],
  38. "title": config["title"].strip(),
  39. "cookie": dict(eval(config["config"]))["cookie"].strip(),
  40. "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(config["update_time"] / 1000))),
  41. "operator": config["operator"].strip()
  42. }
  43. return cookie_dict
  44. @classmethod
  45. def get_videoList(cls, log_type, crawler, user_dict, rule_dict, env):
  46. mq = MQ(topic_name="topic_crawler_etl_" + env)
  47. next_cursor = 0
  48. special = 0
  49. while True:
  50. flag = user_dict["link"].split("_")[0]
  51. if flag == "V1":
  52. special = 0.01
  53. rule_dict = {
  54. "like_cnt": {"min": 10000, "max": 0},
  55. 'period': {"min": 90, "max": 90},
  56. }
  57. elif flag == "V2":
  58. special = 0.01
  59. rule_dict = {
  60. "like_cnt": {"min": 2000, "max": 0},
  61. 'period': {"min": 90, "max": 90},
  62. }
  63. elif flag == "V3":
  64. special = 0.01
  65. rule_dict = {
  66. "like_cnt": {"min": 100, "max": 0},
  67. 'period': {"min": 90, "max": 90},
  68. }
  69. cookie = cls.get_cookie(log_type, crawler, env)["cookie"]
  70. if user_dict['link'][0] == "V":
  71. link = user_dict["link"][3:]
  72. else:
  73. link = user_dict["link"]
  74. time.sleep(random.randint(5, 10))
  75. url = 'https://www.douyin.com/aweme/v1/web/aweme/post/'
  76. account_id = link
  77. headers = {
  78. 'Accept': 'application/json, text/plain, */*',
  79. 'Accept-Language': 'zh-CN,zh;q=0.9',
  80. 'Cache-Control': 'no-cache',
  81. 'Cookie': cookie,
  82. # 'Cookie': "ttwid=1%7Cyj16cpJ4yxvUv9QWry1Uz3MoZ3Ci7FHGQR4qW3W70ac%7C1704436922%7C435637f1aa3c55cbed4587acf02003b5d74cfcac945a0df3893e041a288ce3c1; bd_ticket_guard_client_web_domain=2; passport_assist_user=CjzcKtls0e65w_tjpggJoAB9du8ZDR8XRxt178-cIHsJhxCRZPLxqAj0PHWKZ4g2xmxWzTHsK7mi4vxt0lcaSgo8W1SZlyQoj2vcxlToyotQ902cRuWULW6HqkHEJHMRcIoo_Y7maHi82HqNSTCVE5xBSQnTOXW31hxsJ4EIENPsxQ0Yia_WVCABIgED619Mew%3D%3D; n_mh=uPso8EqWH8OYYER0xnVFOgB1e9TbTzK9J1CBmr4IQVA; sso_uid_tt=f829ccc6652eae601ff8e56da1fccdb5; sso_uid_tt_ss=f829ccc6652eae601ff8e56da1fccdb5; toutiao_sso_user=d2fa09f7626319fb35fd2553b5ec5b76; toutiao_sso_user_ss=d2fa09f7626319fb35fd2553b5ec5b76; LOGIN_STATUS=1; store-region=cn-hn; store-region-src=uid; d_ticket=dd5890b4b8f873453c1f1a090b9aa6ccb205c; sid_ucp_v1=1.0.0-KGU1NTNlNmFjMGJmZTEwNjFhYWZjZTMyMGEzYmI4YmVjOTdjYzU0N2YKGQjPnMP64wEQiLzirQYY7zEgDDgGQPQHSAQaAmxxIiA1M2NmNzM1ZjUyMzA1ZTkxZDMyZTEyMmVhM2ZhYTQ1YQ; ssid_ucp_v1=1.0.0-KGU1NTNlNmFjMGJmZTEwNjFhYWZjZTMyMGEzYmI4YmVjOTdjYzU0N2YKGQjPnMP64wEQiLzirQYY7zEgDDgGQPQHSAQaAmxxIiA1M2NmNzM1ZjUyMzA1ZTkxZDMyZTEyMmVhM2ZhYTQ1YQ; dy_swidth=1449; dy_sheight=906; __live_version__=%221.1.1.8009%22; live_use_vvc=%22false%22; xgplayer_user_id=510446933624; uid_tt=f829ccc6652eae601ff8e56da1fccdb5; uid_tt_ss=f829ccc6652eae601ff8e56da1fccdb5; sid_tt=d2fa09f7626319fb35fd2553b5ec5b76; sessionid=d2fa09f7626319fb35fd2553b5ec5b76; sessionid_ss=d2fa09f7626319fb35fd2553b5ec5b76; passport_csrf_token=34235b71f9c981e07032bd9041848f1e; passport_csrf_token_default=34235b71f9c981e07032bd9041848f1e; download_guide=%223%2F20240313%2F1%22; publish_badge_show_info=%220%2C0%2C0%2C1710488198823%22; EnhanceDownloadGuide=%220_0_0_0_2_1710734032%22; sid_ucp_sso_v1=1.0.0-KGFiZDU4YWZmOTcwOWI1ZGIzOWUxNTBjZTc2Y2IxNmY4MjA3NDU1ZjEKHQjPnMP64wEQ2b7lrwYY7zEgDDDzrarJBTgGQPQHGgJobCIgZDJmYTA5Zjc2MjYzMTlmYjM1ZmQyNTUzYjVlYzViNzY; 
ssid_ucp_sso_v1=1.0.0-KGFiZDU4YWZmOTcwOWI1ZGIzOWUxNTBjZTc2Y2IxNmY4MjA3NDU1ZjEKHQjPnMP64wEQ2b7lrwYY7zEgDDDzrarJBTgGQPQHGgJobCIgZDJmYTA5Zjc2MjYzMTlmYjM1ZmQyNTUzYjVlYzViNzY; sid_guard=d2fa09f7626319fb35fd2553b5ec5b76%7C1710841689%7C5184001%7CSat%2C+18-May-2024+09%3A48%3A10+GMT; __ac_nonce=065fb9e6800dd2b1e14b1; __ac_signature=_02B4Z6wo00f01l39XCgAAIDBYFRGtw.YiKZd3ViAAPKTmnfg2zaxyzrXD6iNtRPPtcoSm5zbE6snYTcix8FTXgxsxQK195O6vG-zEOdZqKTq-ouYFPANlN1Jmu1.ZxBLTzOstKAOorrHEYQN06; douyin.com; xg_device_score=7.654580937785368; device_web_cpu_core=16; device_web_memory_size=8; architecture=amd64; csrf_session_id=f524ab33e8de0e4e922d8b48c362e6c1; strategyABtestKey=%221710988910.691%22; volume_info=%7B%22isUserMute%22%3Afalse%2C%22isMute%22%3Afalse%2C%22volume%22%3A0.281%7D; stream_player_status_params=%22%7B%5C%22is_auto_play%5C%22%3A0%2C%5C%22is_full_screen%5C%22%3A0%2C%5C%22is_full_webscreen%5C%22%3A1%2C%5C%22is_mute%5C%22%3A0%2C%5C%22is_speed%5C%22%3A1%2C%5C%22is_visible%5C%22%3A1%7D%22; my_rd=2; FOLLOW_NUMBER_YELLOW_POINT_INFO=%22MS4wLjABAAAABDh3DP0PxVP5lMvEKL8Fhg8sSD732Z2rOc0db7nqY9o%2F1711036800000%2F0%2F1710989305107%2F0%22; SEARCH_RESULT_LIST_TYPE=%22single%22; stream_recommend_feed_params=%22%7B%5C%22cookie_enabled%5C%22%3Atrue%2C%5C%22screen_width%5C%22%3A1449%2C%5C%22screen_height%5C%22%3A906%2C%5C%22browser_online%5C%22%3Atrue%2C%5C%22cpu_core_num%5C%22%3A16%2C%5C%22device_memory%5C%22%3A8%2C%5C%22downlink%5C%22%3A10%2C%5C%22effective_type%5C%22%3A%5C%224g%5C%22%2C%5C%22round_trip_time%5C%22%3A50%7D%22; bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtcmVlLXB1YmxpYy1rZXkiOiJCT05aUWpZcmNjMWhVYXhidlVoUG9uUC9lV0phYzBNbnhTQldxUmZESGFZQ290cUhOSE1GdmJ2ZTdSY1REdVpiemdHUU82cS90dWhzNVdnTmxaeVR3TzQ9IiwiYmQtdGlja2V0LWd1YXJkLXdlYi12ZXJzaW9uIjoxfQ%3D%3D; tt_scid=IMdpIr4sRF90L0IaD0TdSlOy0Sm1rX-hlw5-OAxNAcisxsztezRzg3356KIGHx4cee78; pwa2=%220%7C0%7C1%7C0%22; 
odin_tt=f05b7460c2544b994f5deae19a5bbf0828870c64564040ef36c9d7cb40da9e44bc41ee52b1cac76d042b80fc4dcb4394; msToken=Tq7-Wv99mG0yhHDIz7-R1fxSAQyf8R7dNAvHMxnjrbWpbi531L8TI6VdQhQSDTAl8jQQJr9IWhJpbRu3E01IgC5uQ7DE_5oGYW046WpPb_bjluz255YhMdqfJ3Qmeg==; FOLLOW_LIVE_POINT_INFO=%22MS4wLjABAAAABDh3DP0PxVP5lMvEKL8Fhg8sSD732Z2rOc0db7nqY9o%2F1711036800000%2F0%2F1710990128301%2F0%22; msToken=YstZKHMONS09-8nDsHM40jwWV2nr5E1wYmv7cBeAmeY02prkpNLjRwB8C3tp52nc1hxvL5R1F-hkmvDSc0TNeNxz-DNodK3GMV8dK3gkVT8DVPKeVL5umskY5Am5; passport_fe_beating_status=false; IsDouyinActive=true; home_can_add_dy_2_desktop=%220%22",
  83. 'Pragma': 'no-cache',
  84. 'Referer': f'https://www.douyin.com/user/{account_id}',
  85. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
  86. 'Chrome/118.0.0.0 Safari/537.36',
  87. }
  88. query = DouYinHelper.get_full_query(ua=headers['User-Agent'], extra_data={
  89. 'sec_user_id': account_id,
  90. 'max_cursor': next_cursor,
  91. 'locate_query': 'false',
  92. 'show_live_replay_strategy': '1',
  93. 'need_time_list': '1',
  94. 'time_list_query': '0',
  95. 'whale_cut_token': '',
  96. 'cut_version': '1',
  97. 'count': '18',
  98. 'publish_video_strategy_type': '2',
  99. })
  100. urllib3.disable_warnings()
  101. s = requests.session()
  102. # max_retries=3 重试3次
  103. s.mount('http://', HTTPAdapter(max_retries=3))
  104. s.mount('https://', HTTPAdapter(max_retries=3))
  105. response = requests.request(method='GET', url=url, headers=headers, params=query)
  106. body = response.content.decode()
  107. obj = json.loads(body)
  108. has_more = True if obj.get('has_more', 0) == 1 else False
  109. next_cursor = str(obj.get('max_cursor')) if has_more else None
  110. data = obj.get('aweme_list', [])
  111. response.close()
  112. if response.status_code != 200:
  113. Common.logger(log_type, crawler).warning(f"data:{data}\n")
  114. AliyunLogger.logging(
  115. code="2000",
  116. platform=crawler,
  117. mode=log_type,
  118. env=env,
  119. message=f"data:{data}\n"
  120. )
  121. return
  122. elif len(data) == 0:
  123. Common.logger(log_type, crawler).warning(f"没有更多视频啦 ~\n")
  124. AliyunLogger.logging(
  125. code="2001",
  126. platform=crawler,
  127. mode=log_type,
  128. env=env,
  129. message=f"没有更多视频啦 ~\n"
  130. )
  131. return
  132. for i in range(len(data)):
  133. try:
  134. entity_type = data[i].get('search_impr').get('entity_type')
  135. if entity_type == 'GENERAL':
  136. Common.logger(log_type, crawler).info('扫描到一条视频\n')
  137. AliyunLogger.logging(
  138. code="1001",
  139. platform=crawler,
  140. mode=log_type,
  141. env=env,
  142. message='扫描到一条视频\n'
  143. )
  144. video_id = data[i].get('aweme_id') # 文章id
  145. video_title = data[i].get('desc', "").strip().replace("\n", "") \
  146. .replace("/", "").replace("\\", "").replace("\r", "") \
  147. .replace(":", "").replace("*", "").replace("?", "") \
  148. .replace("?", "").replace('"', "").replace("<", "") \
  149. .replace(">", "").replace("|", "").replace(" ", "") \
  150. .replace("&NBSP", "").replace(".", "。").replace(" ", "") \
  151. .replace("'", "").replace("#", "").replace("Merge", "")
  152. publish_time_stamp = data[i].get('create_time') # 发布时间
  153. publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
  154. video_url = data[i].get('video').get('play_addr').get('url_list')[0] # 视频链接
  155. cover_url = data[i].get('video').get('cover').get('url_list')[0] # 视频封面
  156. digg_count = int(data[i].get('statistics').get('digg_count')) # 点赞
  157. comment_count = int(data[i].get('statistics').get('comment_count')) # 评论
  158. # collect_count = data[i].get('statistics').get('collect_count') # 收藏
  159. share_count = int(data[i].get('statistics').get('share_count')) # 转发
  160. if special != 0:
  161. if share_count != 0:
  162. video_percent = '%.2f' % (share_count / digg_count)
  163. special = float(special)
  164. if float(video_percent) < special:
  165. Common.logger(log_type, crawler).info(f"不符合条件:分享/点赞-{video_percent}\n")
  166. AliyunLogger.logging(
  167. code="2004",
  168. platform=crawler,
  169. mode=log_type,
  170. env=env,
  171. message=f"不符合条件:分享/点赞-{video_percent},点赞量-{digg_count}\n"
  172. )
  173. continue
  174. video_dict = {'video_title': video_title,
  175. 'video_id': video_id,
  176. 'play_cnt': 0,
  177. 'like_cnt': digg_count,
  178. 'comment_cnt': comment_count,
  179. 'share_cnt': share_count,
  180. 'video_width': 0,
  181. 'video_height': 0,
  182. 'duration': 0,
  183. 'publish_time_stamp': publish_time_stamp,
  184. 'publish_time_str': publish_time_str,
  185. 'user_name': "douyin",
  186. 'user_id': video_id,
  187. 'avatar_url': '',
  188. 'cover_url': cover_url,
  189. 'video_url': video_url,
  190. 'session': f"douyin-{int(time.time())}"}
  191. for k, v in video_dict.items():
  192. Common.logger(log_type, crawler).info(f"{k}:{v}")
  193. AliyunLogger.logging(
  194. code="1000",
  195. platform=crawler,
  196. mode=log_type,
  197. env=env,
  198. message=f"{video_dict}\n"
  199. )
  200. if int((int(time.time()) - int(publish_time_stamp)) / (3600*24)) > int(rule_dict.get("period", {}).get("max", 1000)):
  201. Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
  202. AliyunLogger.logging(
  203. code="2004",
  204. platform=crawler,
  205. mode=log_type,
  206. env=env,
  207. message=f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n'
  208. )
  209. return
  210. if video_dict["video_id"] == '' or video_dict["cover_url"] == '' or video_dict["video_url"] == '':
  211. Common.logger(log_type, crawler).info('无效视频\n')
  212. AliyunLogger.logging(
  213. code="2004",
  214. platform=crawler,
  215. mode=log_type,
  216. env=env,
  217. message='无效视频\n'
  218. )
  219. elif download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
  220. Common.logger(log_type, crawler).info("不满足抓取规则\n")
  221. AliyunLogger.logging(
  222. code="2004",
  223. platform=crawler,
  224. mode=log_type,
  225. env=env,
  226. message='不满足抓取规则\n'
  227. )
  228. elif any(str(word) if str(word) in video_dict["video_title"] else False
  229. for word in get_config_from_mysql(log_type=log_type,
  230. source=crawler,
  231. env=env,
  232. text="filter",
  233. action="")) is True:
  234. Common.logger(log_type, crawler).info('已中过滤词\n')
  235. AliyunLogger.logging(
  236. code="2004",
  237. platform=crawler,
  238. mode=log_type,
  239. env=env,
  240. message='已中过滤词\n'
  241. )
  242. elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
  243. Common.logger(log_type, crawler).info('视频已下载\n')
  244. AliyunLogger.logging(
  245. code="2002",
  246. platform=crawler,
  247. mode=log_type,
  248. env=env,
  249. message='视频已下载\n'
  250. )
  251. else:
  252. video_dict["out_user_id"] = video_dict["user_id"]
  253. video_dict["platform"] = crawler
  254. video_dict["strategy"] = log_type
  255. video_dict["out_video_id"] = video_dict["video_id"]
  256. video_dict["width"] = video_dict["video_width"]
  257. video_dict["height"] = video_dict["video_height"]
  258. video_dict["crawler_rule"] = json.dumps(rule_dict)
  259. video_dict["user_id"] = user_dict["uid"]
  260. video_dict["publish_time"] = video_dict["publish_time_str"]
  261. video_dict["strategy_type"] = log_type
  262. limit_flag = cls.limiter.author_limitation(user_id=video_dict['user_id'])
  263. if limit_flag:
  264. mq.send_msg(video_dict)
  265. cls.download_cnt += 1
  266. except Exception as e:
  267. Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
  268. AliyunLogger.logging(
  269. code="3000",
  270. platform=crawler,
  271. mode=log_type,
  272. env=env,
  273. message=f"抓取单条视频异常:{e}\n"
  274. )
  275. @classmethod
  276. def repeat_video(cls, log_type, crawler, video_id, env):
  277. sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
  278. repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
  279. return len(repeat_video)
  280. @classmethod
  281. def get_author_videos(cls, log_type, crawler, user_list, rule_dict, env):
  282. for user_dict in user_list:
  283. try:
  284. Common.logger(log_type, crawler).info(f"开始抓取 {user_dict['nick_name']} 主页视频")
  285. AliyunLogger.logging(
  286. code="2000",
  287. platform=crawler,
  288. mode=log_type,
  289. env=env,
  290. message=f"开始抓取 {user_dict['nick_name']} 主页视频"
  291. )
  292. cls.download_cnt = 0
  293. cls.get_videoList(log_type=log_type,
  294. crawler=crawler,
  295. user_dict=user_dict,
  296. rule_dict=rule_dict,
  297. env=env)
  298. except Exception as e:
  299. Common.logger(log_type, crawler).warning(f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n")
  300. AliyunLogger.logging(
  301. code="3000",
  302. platform=crawler,
  303. mode=log_type,
  304. env=env,
  305. message=f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n"
  306. )
  307. if __name__ == "__main__":
  308. print(DouyinauthorScheduling.get_cookie("author", "douyin", "prod")["cookie"])
  309. pass