# douyin_author_scheduling_new.py
  1. # -*- coding: utf-8 -*-
  2. # @Time: 2023/11/07
  3. import os
  4. import random
  5. import sys
  6. import time
  7. import cv2
  8. import requests
  9. import json
  10. import urllib3
  11. sys.path.append(os.getcwd())
  12. from datetime import timedelta, date
  13. from common.common import Common
  14. from common import AliyunLogger
  15. from common.mq import MQ
  16. from requests.adapters import HTTPAdapter
  17. from common.scheduling_db import MysqlHelper
  18. from common.public import get_config_from_mysql, download_rule
  19. from douyin.douyin_author.douyin_author_scheduling_help import DouYinHelper
  20. from common.limit import AuthorLimit
  21. class DouyinauthorScheduling:
  22. platform = "抖音"
  23. download_cnt = 0
  24. limiter = AuthorLimit(platform="douyin", mode="author")
  25. @classmethod
  26. def videos_cnt(cls, rule_dict):
  27. videos_cnt = rule_dict.get("videos_cnt", {}).get("min", 0)
  28. if videos_cnt == 0:
  29. videos_cnt = 1000
  30. return videos_cnt
  31. @classmethod
  32. def get_cookie(cls, log_type, crawler, env):
  33. select_sql = f""" select * from crawler_config where source="{crawler}" """
  34. configs = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
  35. for config in configs:
  36. if "cookie" in config["config"]:
  37. cookie_dict = {
  38. "cookie_id": config["id"],
  39. "title": config["title"].strip(),
  40. "cookie": dict(eval(config["config"]))["cookie"].strip(),
  41. "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(config["update_time"] / 1000))),
  42. "operator": config["operator"].strip()
  43. }
  44. return cookie_dict
  45. @classmethod
  46. def get_videoList(cls, log_type, crawler, user_dict, rule_dict, env):
  47. mq = MQ(topic_name="topic_crawler_etl_" + env)
  48. next_cursor = 0
  49. special = 0
  50. for i in range(3):
  51. # while True:
  52. flag = user_dict["link"].split("_")[0]
  53. if flag == "V1":
  54. special = 0.01
  55. rule_dict = {
  56. 'period': {"min": 15, "max": 0},
  57. }
  58. elif flag == "V2":
  59. special = 0.01
  60. rule_dict = {
  61. 'period': {"min": 15, "max": 0},
  62. }
  63. elif flag == "V3":
  64. special = 0.01
  65. rule_dict = {
  66. 'period': {"min": 15, "max": 0},
  67. }
  68. cookie = cls.get_cookie(log_type, crawler, env)["cookie"]
  69. if user_dict['link'][0] == "V":
  70. link = user_dict["link"][3:]
  71. else:
  72. link = user_dict["link"]
  73. time.sleep(random.randint(5, 10))
  74. url = 'https://www.douyin.com/aweme/v1/web/aweme/post/'
  75. account_id = link
  76. headers = {
  77. 'Accept': 'application/json, text/plain, */*',
  78. 'Accept-Language': 'zh-CN,zh;q=0.9',
  79. 'Cache-Control': 'no-cache',
  80. 'Cookie': cookie,
  81. # 'Cookie': "ttwid=1%7Cyj16cpJ4yxvUv9QWry1Uz3MoZ3Ci7FHGQR4qW3W70ac%7C1704436922%7C435637f1aa3c55cbed4587acf02003b5d74cfcac945a0df3893e041a288ce3c1; bd_ticket_guard_client_web_domain=2; passport_assist_user=CjzcKtls0e65w_tjpggJoAB9du8ZDR8XRxt178-cIHsJhxCRZPLxqAj0PHWKZ4g2xmxWzTHsK7mi4vxt0lcaSgo8W1SZlyQoj2vcxlToyotQ902cRuWULW6HqkHEJHMRcIoo_Y7maHi82HqNSTCVE5xBSQnTOXW31hxsJ4EIENPsxQ0Yia_WVCABIgED619Mew%3D%3D; n_mh=uPso8EqWH8OYYER0xnVFOgB1e9TbTzK9J1CBmr4IQVA; sso_uid_tt=f829ccc6652eae601ff8e56da1fccdb5; sso_uid_tt_ss=f829ccc6652eae601ff8e56da1fccdb5; toutiao_sso_user=d2fa09f7626319fb35fd2553b5ec5b76; toutiao_sso_user_ss=d2fa09f7626319fb35fd2553b5ec5b76; LOGIN_STATUS=1; store-region=cn-hn; store-region-src=uid; d_ticket=dd5890b4b8f873453c1f1a090b9aa6ccb205c; sid_ucp_v1=1.0.0-KGU1NTNlNmFjMGJmZTEwNjFhYWZjZTMyMGEzYmI4YmVjOTdjYzU0N2YKGQjPnMP64wEQiLzirQYY7zEgDDgGQPQHSAQaAmxxIiA1M2NmNzM1ZjUyMzA1ZTkxZDMyZTEyMmVhM2ZhYTQ1YQ; ssid_ucp_v1=1.0.0-KGU1NTNlNmFjMGJmZTEwNjFhYWZjZTMyMGEzYmI4YmVjOTdjYzU0N2YKGQjPnMP64wEQiLzirQYY7zEgDDgGQPQHSAQaAmxxIiA1M2NmNzM1ZjUyMzA1ZTkxZDMyZTEyMmVhM2ZhYTQ1YQ; dy_swidth=1449; dy_sheight=906; __live_version__=%221.1.1.8009%22; live_use_vvc=%22false%22; xgplayer_user_id=510446933624; uid_tt=f829ccc6652eae601ff8e56da1fccdb5; uid_tt_ss=f829ccc6652eae601ff8e56da1fccdb5; sid_tt=d2fa09f7626319fb35fd2553b5ec5b76; sessionid=d2fa09f7626319fb35fd2553b5ec5b76; sessionid_ss=d2fa09f7626319fb35fd2553b5ec5b76; passport_csrf_token=34235b71f9c981e07032bd9041848f1e; passport_csrf_token_default=34235b71f9c981e07032bd9041848f1e; download_guide=%223%2F20240313%2F1%22; publish_badge_show_info=%220%2C0%2C0%2C1710488198823%22; EnhanceDownloadGuide=%220_0_0_0_2_1710734032%22; sid_ucp_sso_v1=1.0.0-KGFiZDU4YWZmOTcwOWI1ZGIzOWUxNTBjZTc2Y2IxNmY4MjA3NDU1ZjEKHQjPnMP64wEQ2b7lrwYY7zEgDDDzrarJBTgGQPQHGgJobCIgZDJmYTA5Zjc2MjYzMTlmYjM1ZmQyNTUzYjVlYzViNzY; 
ssid_ucp_sso_v1=1.0.0-KGFiZDU4YWZmOTcwOWI1ZGIzOWUxNTBjZTc2Y2IxNmY4MjA3NDU1ZjEKHQjPnMP64wEQ2b7lrwYY7zEgDDDzrarJBTgGQPQHGgJobCIgZDJmYTA5Zjc2MjYzMTlmYjM1ZmQyNTUzYjVlYzViNzY; sid_guard=d2fa09f7626319fb35fd2553b5ec5b76%7C1710841689%7C5184001%7CSat%2C+18-May-2024+09%3A48%3A10+GMT; __ac_nonce=065fb9e6800dd2b1e14b1; __ac_signature=_02B4Z6wo00f01l39XCgAAIDBYFRGtw.YiKZd3ViAAPKTmnfg2zaxyzrXD6iNtRPPtcoSm5zbE6snYTcix8FTXgxsxQK195O6vG-zEOdZqKTq-ouYFPANlN1Jmu1.ZxBLTzOstKAOorrHEYQN06; douyin.com; xg_device_score=7.654580937785368; device_web_cpu_core=16; device_web_memory_size=8; architecture=amd64; csrf_session_id=f524ab33e8de0e4e922d8b48c362e6c1; strategyABtestKey=%221710988910.691%22; volume_info=%7B%22isUserMute%22%3Afalse%2C%22isMute%22%3Afalse%2C%22volume%22%3A0.281%7D; stream_player_status_params=%22%7B%5C%22is_auto_play%5C%22%3A0%2C%5C%22is_full_screen%5C%22%3A0%2C%5C%22is_full_webscreen%5C%22%3A1%2C%5C%22is_mute%5C%22%3A0%2C%5C%22is_speed%5C%22%3A1%2C%5C%22is_visible%5C%22%3A1%7D%22; my_rd=2; FOLLOW_NUMBER_YELLOW_POINT_INFO=%22MS4wLjABAAAABDh3DP0PxVP5lMvEKL8Fhg8sSD732Z2rOc0db7nqY9o%2F1711036800000%2F0%2F1710989305107%2F0%22; SEARCH_RESULT_LIST_TYPE=%22single%22; stream_recommend_feed_params=%22%7B%5C%22cookie_enabled%5C%22%3Atrue%2C%5C%22screen_width%5C%22%3A1449%2C%5C%22screen_height%5C%22%3A906%2C%5C%22browser_online%5C%22%3Atrue%2C%5C%22cpu_core_num%5C%22%3A16%2C%5C%22device_memory%5C%22%3A8%2C%5C%22downlink%5C%22%3A10%2C%5C%22effective_type%5C%22%3A%5C%224g%5C%22%2C%5C%22round_trip_time%5C%22%3A50%7D%22; bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtcmVlLXB1YmxpYy1rZXkiOiJCT05aUWpZcmNjMWhVYXhidlVoUG9uUC9lV0phYzBNbnhTQldxUmZESGFZQ290cUhOSE1GdmJ2ZTdSY1REdVpiemdHUU82cS90dWhzNVdnTmxaeVR3TzQ9IiwiYmQtdGlja2V0LWd1YXJkLXdlYi12ZXJzaW9uIjoxfQ%3D%3D; tt_scid=IMdpIr4sRF90L0IaD0TdSlOy0Sm1rX-hlw5-OAxNAcisxsztezRzg3356KIGHx4cee78; pwa2=%220%7C0%7C1%7C0%22; 
odin_tt=f05b7460c2544b994f5deae19a5bbf0828870c64564040ef36c9d7cb40da9e44bc41ee52b1cac76d042b80fc4dcb4394; msToken=Tq7-Wv99mG0yhHDIz7-R1fxSAQyf8R7dNAvHMxnjrbWpbi531L8TI6VdQhQSDTAl8jQQJr9IWhJpbRu3E01IgC5uQ7DE_5oGYW046WpPb_bjluz255YhMdqfJ3Qmeg==; FOLLOW_LIVE_POINT_INFO=%22MS4wLjABAAAABDh3DP0PxVP5lMvEKL8Fhg8sSD732Z2rOc0db7nqY9o%2F1711036800000%2F0%2F1710990128301%2F0%22; msToken=YstZKHMONS09-8nDsHM40jwWV2nr5E1wYmv7cBeAmeY02prkpNLjRwB8C3tp52nc1hxvL5R1F-hkmvDSc0TNeNxz-DNodK3GMV8dK3gkVT8DVPKeVL5umskY5Am5; passport_fe_beating_status=false; IsDouyinActive=true; home_can_add_dy_2_desktop=%220%22",
  82. 'Pragma': 'no-cache',
  83. 'Referer': f'https://www.douyin.com/user/{account_id}',
  84. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
  85. 'Chrome/118.0.0.0 Safari/537.36',
  86. }
  87. query = DouYinHelper.get_full_query(ua=headers['User-Agent'], extra_data={
  88. 'sec_user_id': account_id,
  89. 'max_cursor': next_cursor,
  90. 'locate_query': 'false',
  91. 'show_live_replay_strategy': '1',
  92. 'need_time_list': '1',
  93. 'time_list_query': '0',
  94. 'whale_cut_token': '',
  95. 'cut_version': '1',
  96. 'count': '18',
  97. 'publish_video_strategy_type': '2',
  98. })
  99. urllib3.disable_warnings()
  100. s = requests.session()
  101. # max_retries=3 重试3次
  102. s.mount('http://', HTTPAdapter(max_retries=3))
  103. s.mount('https://', HTTPAdapter(max_retries=3))
  104. response = requests.request(method='GET', url=url, headers=headers, params=query)
  105. body = response.content.decode()
  106. obj = json.loads(body)
  107. has_more = True if obj.get('has_more', 0) == 1 else False
  108. next_cursor = str(obj.get('max_cursor')) if has_more else None
  109. data = obj.get('aweme_list', [])
  110. response.close()
  111. if response.status_code != 200:
  112. Common.logger(log_type, crawler).warning(f"data:{data}\n")
  113. AliyunLogger.logging(
  114. code="2000",
  115. platform=crawler,
  116. mode=log_type,
  117. env=env,
  118. message=f"data:{data}\n"
  119. )
  120. return
  121. elif len(data) == 0:
  122. Common.logger(log_type, crawler).warning(f"没有更多视频啦 ~\n")
  123. AliyunLogger.logging(
  124. code="2001",
  125. platform=crawler,
  126. mode=log_type,
  127. env=env,
  128. message=f"没有更多视频啦 ~\n"
  129. )
  130. return
  131. for i in range(len(data)):
  132. try:
  133. entity_type = data[i].get('search_impr').get('entity_type')
  134. if entity_type == 'GENERAL':
  135. Common.logger(log_type, crawler).info('扫描到一条视频\n')
  136. AliyunLogger.logging(
  137. code="1001",
  138. platform=crawler,
  139. mode=log_type,
  140. env=env,
  141. message='扫描到一条视频\n'
  142. )
  143. is_top = data[i].get('is_top') # 是否置顶
  144. video_id = data[i].get('aweme_id') # 文章id
  145. video_title = data[i].get('desc', "").strip().replace("\n", "") \
  146. .replace("/", "").replace("\\", "").replace("\r", "") \
  147. .replace(":", "").replace("*", "").replace("?", "") \
  148. .replace("?", "").replace('"', "").replace("<", "") \
  149. .replace(">", "").replace("|", "").replace(" ", "") \
  150. .replace("&NBSP", "").replace(".", "。").replace(" ", "") \
  151. .replace("'", "").replace("#", "").replace("Merge", "")
  152. publish_time_stamp = data[i].get('create_time') # 发布时间
  153. publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
  154. # video_url = data[i].get('video').get('play_addr').get('url_list')[0] # 视频链接
  155. video_uri = data[i].get('video', {}).get('play_addr', {}).get('uri')
  156. ratio = f'{data[i].get("video", {}).get("height")}p'
  157. video_url = f'https://www.iesdouyin.com/aweme/v1/play/?video_id={video_uri}&ratio={ratio}&line=0' # 视频链接
  158. cover_url = data[i].get('video').get('cover').get('url_list')[0] # 视频封面
  159. digg_count = int(data[i].get('statistics').get('digg_count')) # 点赞
  160. comment_count = int(data[i].get('statistics').get('comment_count')) # 评论
  161. # collect_count = data[i].get('statistics').get('collect_count') # 收藏
  162. share_count = int(data[i].get('statistics').get('share_count')) # 转发
  163. if share_count < 500:
  164. AliyunLogger.logging(
  165. code="2004",
  166. platform=crawler,
  167. mode=log_type,
  168. env=env,
  169. message=f'分享小于500\n'
  170. )
  171. continue
  172. video_percent = '%.2f' % (share_count / digg_count)
  173. special = float(0.25)
  174. if float(video_percent) < special:
  175. AliyunLogger.logging(
  176. code="2004",
  177. platform=crawler,
  178. mode=log_type,
  179. env=env,
  180. message=f'分享/点赞小于25%\n'
  181. )
  182. continue
  183. duration = cls.video_duration(video_url)
  184. if int(duration) < 45:
  185. AliyunLogger.logging(
  186. code="2004",
  187. platform=crawler,
  188. mode=log_type,
  189. env=env,
  190. message=f'视频时常小于45秒\n'
  191. )
  192. continue
  193. # if special != 0:
  194. # if share_count != 0:
  195. # video_percent = '%.2f' % (share_count / digg_count)
  196. # special = float(special)
  197. # if float(video_percent) < special:
  198. # Common.logger(log_type, crawler).info(f"不符合条件:分享/点赞-{video_percent}\n")
  199. # AliyunLogger.logging(
  200. # code="2004",
  201. # platform=crawler,
  202. # mode=log_type,
  203. # env=env,
  204. # message=f"不符合条件:分享/点赞-{video_percent},点赞量-{digg_count}\n"
  205. # )
  206. # continue
  207. video_dict = {'video_title': video_title,
  208. 'video_id': video_id,
  209. 'play_cnt': 0,
  210. 'like_cnt': digg_count,
  211. 'comment_cnt': comment_count,
  212. 'share_cnt': share_count,
  213. 'video_width': 0,
  214. 'video_height': 0,
  215. 'duration': 0,
  216. 'publish_time_stamp': publish_time_stamp,
  217. 'publish_time_str': publish_time_str,
  218. 'user_name': "douyin",
  219. 'user_id': video_id,
  220. 'avatar_url': '',
  221. 'cover_url': cover_url,
  222. 'video_url': video_url,
  223. 'session': f"douyin-{int(time.time())}"}
  224. for k, v in video_dict.items():
  225. Common.logger(log_type, crawler).info(f"{k}:{v}")
  226. AliyunLogger.logging(
  227. code="1000",
  228. platform=crawler,
  229. mode=log_type,
  230. env=env,
  231. message=f"{video_dict}\n"
  232. )
  233. if is_top == 0:
  234. if int((int(time.time()) - int(publish_time_stamp)) / (3600*24)) > int(rule_dict.get("period", {}).get("max", 1000)):
  235. Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
  236. AliyunLogger.logging(
  237. code="2004",
  238. platform=crawler,
  239. mode=log_type,
  240. env=env,
  241. message=f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n'
  242. )
  243. return
  244. if video_dict["video_id"] == '' or video_dict["cover_url"] == '' or video_dict["video_url"] == '':
  245. Common.logger(log_type, crawler).info('无效视频\n')
  246. AliyunLogger.logging(
  247. code="2004",
  248. platform=crawler,
  249. mode=log_type,
  250. env=env,
  251. message='无效视频\n'
  252. )
  253. elif download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
  254. Common.logger(log_type, crawler).info("不满足抓取规则\n")
  255. AliyunLogger.logging(
  256. code="2004",
  257. platform=crawler,
  258. mode=log_type,
  259. env=env,
  260. message='不满足抓取规则\n'
  261. )
  262. elif any(str(word) if str(word) in video_dict["video_title"] else False
  263. for word in get_config_from_mysql(log_type=log_type,
  264. source=crawler,
  265. env=env,
  266. text="filter",
  267. action="")) is True:
  268. Common.logger(log_type, crawler).info('已中过滤词\n')
  269. AliyunLogger.logging(
  270. code="2004",
  271. platform=crawler,
  272. mode=log_type,
  273. env=env,
  274. message='已中过滤词\n'
  275. )
  276. elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
  277. Common.logger(log_type, crawler).info('视频已下载\n')
  278. AliyunLogger.logging(
  279. code="2002",
  280. platform=crawler,
  281. mode=log_type,
  282. env=env,
  283. message='视频已下载\n'
  284. )
  285. else:
  286. video_dict["out_user_id"] = video_dict["user_id"]
  287. video_dict["platform"] = crawler
  288. video_dict["strategy"] = log_type
  289. video_dict["out_video_id"] = video_dict["video_id"]
  290. video_dict["width"] = video_dict["video_width"]
  291. video_dict["height"] = video_dict["video_height"]
  292. video_dict["crawler_rule"] = json.dumps(rule_dict)
  293. video_dict["user_id"] = user_dict["uid"]
  294. video_dict["publish_time"] = video_dict["publish_time_str"]
  295. video_dict["strategy_type"] = log_type
  296. limit_flag = cls.limiter.author_limitation(user_id=video_dict['user_id'])
  297. if limit_flag:
  298. mq.send_msg(video_dict)
  299. cls.download_cnt += 1
  300. AliyunLogger.logging(code="1002", message="成功发送至 ETL", data=video_dict)
  301. except Exception as e:
  302. Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
  303. AliyunLogger.logging(
  304. code="3000",
  305. platform=crawler,
  306. mode=log_type,
  307. env=env,
  308. message=f"抓取单条视频异常:{e}\n"
  309. )
  310. @classmethod
  311. def repeat_video(cls, log_type, crawler, video_id, env):
  312. sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
  313. repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
  314. return len(repeat_video)
  315. @classmethod
  316. def get_author_videos(cls, log_type, crawler, user_list, rule_dict, env):
  317. for user_dict in user_list:
  318. try:
  319. Common.logger(log_type, crawler).info(f"开始抓取 {user_dict['nick_name']} 主页视频")
  320. AliyunLogger.logging(
  321. code="2000",
  322. platform=crawler,
  323. mode=log_type,
  324. env=env,
  325. message=f"开始抓取 {user_dict['nick_name']} 主页视频"
  326. )
  327. cls.download_cnt = 0
  328. cls.get_videoList(log_type=log_type,
  329. crawler=crawler,
  330. user_dict=user_dict,
  331. rule_dict=rule_dict,
  332. env=env)
  333. except Exception as e:
  334. Common.logger(log_type, crawler).warning(f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n")
  335. AliyunLogger.logging(
  336. code="3000",
  337. platform=crawler,
  338. mode=log_type,
  339. env=env,
  340. message=f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n"
  341. )
  342. @classmethod
  343. def video_duration(cls, filename):
  344. cap = cv2.VideoCapture(filename)
  345. if cap.isOpened():
  346. rate = cap.get(5)
  347. frame_num = cap.get(7)
  348. duration = frame_num / rate
  349. return duration
  350. return 0
  351. if __name__ == "__main__":
  352. print(DouyinauthorScheduling.get_cookie("author", "douyin", "prod")["cookie"])
  353. pass