recommend_dy.py

# -*- coding: utf-8 -*-
# @Author: lierqiang
# @Time: 2023/4/06
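"""Douyin (抖音) recommendation-feed crawler.

Reads download rules and filter words from Feishu sheets, pulls the PC-web
recommendation feed page by page, downloads the videos that pass the rules,
publishes them, and records each result in Feishu and MySQL.
"""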
import json
import os
import shutil
import sys
import time
from hashlib import md5

import requests

sys.path.append(os.getcwd())
from common.common import Common
from common.feishu import Feishu
from common.db import MysqlHelper
from common.publish import Publish
from douyin.douyin_recommend import get_xb
from common.public import get_config_from_mysql
from common.public import random_title
from common.userAgent import get_random_user_agent


class DyRecommend(object):
    platform = "抖音"

    @classmethod
    def get_rule(cls, log_type, crawler):
        try:
            while True:
                rule_sheet = Feishu.get_values_batch(log_type, crawler, "a6L9Kb")
                if rule_sheet is None:
                    Common.logger(log_type, crawler).warning("rule_sheet is None! Retrying in 10 seconds")
                    time.sleep(10)
                    continue
                rule_dict = {
                    "video_width": int(rule_sheet[0][2]),
                    "video_height": int(rule_sheet[1][2]),
                    "like_cnt": int(rule_sheet[2][2]),
                    "duration": int(rule_sheet[3][2]),
                    "publish_time": int(rule_sheet[4][2]),
                    "share_cnt": int(rule_sheet[5][2])
                }
                return rule_dict
        except Exception as e:
            Common.logger(log_type, crawler).error(f"get_rule:{e}\n")
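
    # Note on get_rule(): sheet "a6L9Kb" is read as one threshold per row, with the
    # numeric value taken from the third column (index 2), in the order video_width,
    # video_height, like_cnt, duration, publish_time, share_cnt.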

    # Download rules
    @classmethod
    def download_rule(cls, video_info_dict, rule_dict):
        return (video_info_dict['like_cnt'] >= rule_dict['like_cnt']
                and video_info_dict['duration'] >= rule_dict['duration']
                and (video_info_dict['video_width'] >= rule_dict['video_width']
                     or video_info_dict['video_height'] >= rule_dict['video_height'])
                and video_info_dict['share_cnt'] >= rule_dict['share_cnt'])
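
    # Illustrative use of download_rule (hypothetical numbers): a video dict with
    # like_cnt=1200, duration=65, video_width=720, video_height=1280 and share_cnt=30
    # passes a rule dict with like_cnt=1000, duration=60, video_width=720,
    # video_height=720 and share_cnt=10, because every threshold is met.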

    # Filter-word list
    @classmethod
    def filter_words(cls, log_type, crawler):
        try:
            while True:
                filter_words_sheet = Feishu.get_values_batch(log_type, crawler, '6BS2RR')
                if filter_words_sheet is None:
                    Common.logger(log_type, crawler).warning(f"filter_words_sheet:{filter_words_sheet}, retrying in 10 seconds")
                    time.sleep(10)
                    continue
                filter_words_list = []
                for x in filter_words_sheet:
                    for y in x:
                        if y is not None:
                            filter_words_list.append(y)
                return filter_words_list
        except Exception as e:
            Common.logger(log_type, crawler).error(f'filter_words exception: {e}\n')

    @classmethod
    def video_title(cls, log_type, crawler, env, title):
        title_split1 = title.split(" #")
        if title_split1[0] != "":
            title1 = title_split1[0]
        else:
            title1 = title_split1[-1]

        title_split2 = title1.split(" #")
        if title_split2[0] != "":
            title2 = title_split2[0]
        else:
            title2 = title_split2[-1]

        title_split3 = title2.split("@")
        if title_split3[0] != "":
            title3 = title_split3[0]
        else:
            title3 = title_split3[-1]

        video_title = title3.strip().replace("\n", "") \
            .replace("/", "").replace("抖音", "").replace(" ", "") \
            .replace(" ", "").replace("&NBSP", "").replace("\r", "") \
            .replace("#", "").replace(".", "。").replace("\\", "") \
            .replace(":", "").replace("*", "").replace("?", "") \
            .replace("?", "").replace('"', "").replace("<", "") \
            .replace(">", "").replace("|", "").replace("@", "").replace('"', '').replace("'", '')[:40]
        if video_title.replace(" ", "") == "" or video_title == "。。。" or video_title == "...":
            return random_title(log_type, crawler, env, text='title')
        else:
            return video_title
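
    # Example (hypothetical input): video_title() turns "今天去公园玩 #亲子 #遛娃 @抖音小助手"
    # into "今天去公园玩": text after " #" and "@" is dropped, forbidden characters are
    # stripped, and the result is capped at 40 characters.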

    @classmethod
    def get_videolist(cls, log_type, crawler, strategy, our_id, oss_endpoint, env, machine):
        rule_dict = cls.get_rule(log_type, crawler)
        for page in range(1, 101):
            try:
                aweme_pc_rec_raw_data = '%7B%22videoPrefer%22:%7B%22fsn%22:%5B%5D,%22like%22:%5B%5D,%22halfMin%22:%5B%227188684310696742200%22,%224380080926896941%22%5D,%22min%22:%5B%5D%7D,%22seo_info%22:%22https:%2F%2Fwww.douyin.com%2F%22,%22is_client%22:false,%22ff_danmaku_status%22:1,%22danmaku_switch_status%22:1%7D'
                f_url = 'https://www.douyin.com/aweme/v1/web/tab/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=&share_aweme_id=&count=10&refresh_index={page}&video_type_select=1&aweme_pc_rec_raw_data={aweme_pc_rec_raw_data}&globalwid=&pull_type=2&min_window=0&ug_source=&creative_id=&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1440&screen_height=900&browser_language=zh-CN&browser_platform=MacIntel&browser_name=Chrome&browser_version=109.0.0.0&browser_online=true&engine_name=Blink&engine_version=109.0.0.0&os_name=Mac+OS&os_version=10.15.7&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7219223873342260736&msToken=Sh6bVLWZUEZ3ruIHq1L3iUXnr1GT5yklyo-XZRO7lNgsFvpYq0C7tcu5Z4Jv9DrMESZ9kGVhwKT4ftFDkBL11ZGPUxth2ToA4M4q-qs2MK9ctR7GhwFkGw=='.format(
                    page=page, aweme_pc_rec_raw_data=aweme_pc_rec_raw_data)
                headers = {
                    # 'cookie': 'ttwid=1%7CzpiG_VTvd1xpRFvfHUjEHiaq3qkfUqPElZUu0wbTSr8%7C1680507728%7Cc61697d37b4d4d49d42b466a3bbe8ecd5c06ae6e9a751d9e410102d2c52a185d; douyin.com; passport_csrf_token=208a829b0156a2feaa0fa24ad026ea91; passport_csrf_token_default=208a829b0156a2feaa0fa24ad026ea91; s_v_web_id=verify_lg0iwv1g_BwfztkmU_azbL_4Gua_9Fb9_KWfGPVXCyWua; bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtY2xpZW50LWNzciI6Ii0tLS0tQkVHSU4gQ0VSVElGSUNBVEUgUkVRVUVTVC0tLS0tXHJcbk1JSUJEekNCdFFJQkFEQW5NUXN3Q1FZRFZRUUdFd0pEVGpFWU1CWUdBMVVFQXd3UFltUmZkR2xqYTJWMFgyZDFcclxuWVhKa01Ga3dFd1lIS29aSXpqMENBUVlJS29aSXpqMERBUWNEUWdBRU9zWGhGbG5ZWjVNeG5ZRGFFOCtCYmRGdFxyXG5VZTh6SG0ycTRXeWxvdkxXVXVOcy9oV2tlZlBRK3BsNkg2OGQwdGtOVVB5UStmUnpyWlRFL1ZXMTR5UlRkS0FzXHJcbk1Db0dDU3FHU0liM0RRRUpEakVkTUJzd0dRWURWUjBSQkJJd0VJSU9kM2QzTG1SdmRYbHBiaTVqYjIwd0NnWUlcclxuS29aSXpqMEVBd0lEU1FBd1JnSWhBTmdPS3Jkb3V4SHBzcHNiY0dmUHJYQ0lVNnVwcmZkd2ZFY2g5TXZndW5Ea1xyXG5BaUVBM2xVeDQ2bzd0UWJUT0dXdzgzQm45RnFyQkRVVHNOVjkyZUEyR1hPR3BkVT1cclxuLS0tLS1FTkQgQ0VSVElGSUNBVEUgUkVRVUVTVC0tLS0tXHJcbiJ9; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; ttcid=60304e8eb309434784f02372ef36387b41; xgplayer_user_id=446878319428; __ac_nonce=0642bcb0a001596f3fe5b; __ac_signature=_02B4Z6wo00f012HU-wAAAIDA9QKgDGYSlVNh9P-AALxqUQdGOEO.l3IAhdmUh4D-Y9rXLut3p7moXUuAUmo7rUOUJzpnB9nLx0YdZcvdMNeUgQOjsGIHh9LTN38BOVtrElZBeXDLjuVVC5Hh81; strategyABtestKey=%221680591628.189%22; download_guide=%223%2F20230404%22; VIDEO_FILTER_MEMO_SELECT=%7B%22expireTime%22%3A1681196521206%2C%22type%22%3A0%7D; home_can_add_dy_2_desktop=%221%22; msToken=v_pzGLfpXwl4PugynDwIb5DeepUms68tZLZFNLHl8WQnEeNQZtaawWYVu4Y3TLWxpqbvgqkOFULGmld2BLBZydbzrMJgkx5q5GjqetkVI4GoxLX1QdJQ0CP607uEVw==; tt_scid=NZGTg99heu5lHFfAvBht7p3Qxl0TGP.TyfxOQ7cWIvZjEnOcZERFaJxQ.HnKY-UT18cb; msToken=JFY-VD2YHS-6IO6lNhTj4AcLVtHqjein_5giYzIUh_VRsXMPFXy9QOg-RKDDwQgW5TTbgQB_BLzpfQhNdNEQCv5sGXatzGei9yppG1eSLLkbI9fjhTdBWtdkAJpLIg==',
                    'cookie': 'ttwid=1%7CI2Xp275XabSiVJ9GAmfLtqbtqturVSIS2yLbXVkAHnQ%7C1680856567%7Cfd36579475157e2303e36e5fd75cdea4ebad78c20da989be0590305f169242ca; douyin.com; strategyABtestKey=%221680856567.817%22; passport_csrf_token=6a28a7b1e1ad38570cc5ee39deaf587a; passport_csrf_token_default=6a28a7b1e1ad38570cc5ee39deaf587a; s_v_web_id=verify_lg6aloex_WG2LmlFk_kbA6_4tMb_BZlD_iuKxRvsMvrg7; bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtY2xpZW50LWNzciI6Ii0tLS0tQkVHSU4gQ0VSVElGSUNBVEUgUkVRVUVTVC0tLS0tXHJcbk1JSUJEakNCdFFJQkFEQW5NUXN3Q1FZRFZRUUdFd0pEVGpFWU1CWUdBMVVFQXd3UFltUmZkR2xqYTJWMFgyZDFcclxuWVhKa01Ga3dFd1lIS29aSXpqMENBUVlJS29aSXpqMERBUWNEUWdBRWNFZitKV2tDb3l4OHhLekNnY0hReEVaRFxyXG5hN29maHJhUG9rMkQ5b1RaRGRvbDJuTVhaTis5dGJFclV0cVdUcm81ck4zekFyWTFLaXIzRlRUR2ZQUXRmS0FzXHJcbk1Db0dDU3FHU0liM0RRRUpEakVkTUJzd0dRWURWUjBSQkJJd0VJSU9kM2QzTG1SdmRYbHBiaTVqYjIwd0NnWUlcclxuS29aSXpqMEVBd0lEU0FBd1JRSWhBUFZaSGNFdW5HeGtBZFNmQXJ1MmdWb1RHbFhINkhsa1prRzZNc1pyR2hBL1xyXG5BaUJsV2NpM3h5SDk2UnJlTXpPSy8xVmFJQUNuTWUyU0RodUJIY2ZZaE80OWtRPT1cclxuLS0tLS1FTkQgQ0VSVElGSUNBVEUgUkVRVUVTVC0tLS0tXHJcbiJ9; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; ttcid=585ca9110e2345a09b4499dc543ec39959; odin_tt=d5fbc530c34bf8d4ea8cf8030c85a3f70202ee34dad7fb5b9ef7ff299d20e38ed01c852f02f2f3f864f2a45060480717f72083b58cdbc204c988edbf997fda7c; xgplayer_user_id=64984931555; SEARCH_RESULT_LIST_TYPE=%22single%22; pwa2=%222%7C1%22; download_guide=%223%2F20230407%22; __ac_nonce=0642fff310028297fec16; __ac_signature=_02B4Z6wo00f01M3AeJAAAIDDWRYjnPVAtVzN4HwAAFdlurQp2aR1Npvb7RYeaCY4fZs3DkMFlu7-Obn7zsvc34whBQesvTIc2p8nV-1crQtTacaxSYqP8nwNk3WqH.tkNdKQaUMw6sCC2.xZ15; VIDEO_FILTER_MEMO_SELECT=%7B%22expireTime%22%3A1681472712797%2C%22type%22%3A1%7D; home_can_add_dy_2_desktop=%221%22; tt_scid=PNmY2BL-Q9E9lAfdecenRGOzq64XgFOc0CJGFMN.JIE-QJO51S3Zvw56-Z6O12QVf4d9; msToken=uFCSX87jL9sTq6ScVYJucYgv9Hd5gCbTPvKIGBRMgVLuo7Pp9zLRrutBYzq4BmnCr83WnJAwZb8H78lBr3s3eyJLnySxYO5FgClQRXW1i_mAu7fLfBj3gA==; msToken=Sh6bVLWZUEZ3ruIHq1L3iUXnr1GT5yklyo-XZRO7lNgsFvpYq0C7tcu5Z4Jv9DrMESZ9kGVhwKT4ftFDkBL11ZGPUxth2ToA4M4q-qs2MK9ctR7GhwFkGw==',
                    'referer': 'https://www.douyin.com/',
                    'user-agent': get_random_user_agent('pc')
                }
                try:
                    x_bogus = get_xb(f_url, headers['user-agent'])
                    if not x_bogus:
                        continue
                    url = f_url + '&X-Bogus={}'.format(x_bogus)
                    res = requests.get(url=url, headers=headers, proxies=Common.tunnel_proxies()).json()
                    aweme_list = res.get('aweme_list', [])
                except Exception as e:
                    Common.logger(log_type, crawler).error(f"Failed to fetch the Douyin recommendation feed: {e}")
                    continue
                if not aweme_list:
                    Common.logger(log_type, crawler).warning(f"No more data from the Douyin recommendation feed, page: {page}")
                    continue
                for info in aweme_list:
                    try:
                        if info.get('is_ads'):
                            continue
                        publish_time = info.get('create_time')
                        if not publish_time:
                            continue
                        publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time))
                        publish_day = int((int(time.time()) - publish_time) / (3600 * 24))
                        if not info['desc']:
                            video_title = random_title(log_type, crawler, env, text='title')
                        else:
                            video_title = cls.video_title(log_type, crawler, env, info['desc'])
                        video_dict = {'video_title': video_title,
                                      'video_id': info['aweme_id'],
                                      'play_cnt': info['statistics']['play_count'],
                                      'comment_cnt': info['statistics']['comment_count'],
                                      'like_cnt': info['statistics']['digg_count'],
                                      'share_cnt': info['statistics']['share_count'],
                                      'video_width': info['video']['width'],
                                      'video_height': info['video']['height'],
                                      'duration': round(info['video']['duration'] / 1000),
                                      'publish_time': publish_day,
                                      'publish_time_stamp': publish_time * 1000,
                                      'publish_time_str': publish_time_str,
                                      'user_name': info['author']['nickname'],
                                      'user_id': info['author_user_id'],
                                      'user_sec_id': info['author']['sec_uid'],
                                      'avatar_url': info['author']['avatar_thumb']['url_list'][0],
                                      'cover_url': info['video']['origin_cover']['url_list'][0].replace('\u0026', '&'),
                                      'video_url': info['video']['play_addr']['url_list'][0],
                                      'session': f"douyin{int(time.time())}"
                                      }
                        for k, v in video_dict.items():
                            Common.logger(log_type, crawler).info(f"{k}:{v}")
                        cls.download_publish(log_type=log_type,
                                             crawler=crawler,
                                             video_dict=video_dict,
                                             rule_dict=rule_dict,
                                             strategy=strategy,
                                             our_uid=our_id,
                                             oss_endpoint=oss_endpoint,
                                             env=env,
                                             machine=machine)
                    except Exception as e:
                        Common.logger(log_type, crawler).error(f"Exception while scraping a single video: {e}\n")
            except Exception as e:
                Common.logger(log_type, crawler).error(f"Exception while scraping page {page}: {e}\n")

    @classmethod
    def repeat_video(cls, log_type, crawler, video_id, env, machine):
        sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
        return len(repeat_video)
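
    # For a hypothetical video_id "7219000000000000000" the query above renders as
    #   select * from crawler_video where platform="抖音" and out_video_id="7219000000000000000";
    # a non-zero row count means the video has already been crawled.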

    # Download / upload
    @classmethod
    def download_publish(cls, log_type, crawler, strategy, video_dict, rule_dict, our_uid, oss_endpoint, env, machine):
        filter_words = get_config_from_mysql(log_type, crawler, env, text='filter')
        for filter_word in filter_words:
            if filter_word in video_dict['video_title']:
                Common.logger(log_type, crawler).info(f"Title hit a filter word: {video_dict['video_title']}\n")
                return
        if cls.download_rule(video_dict, rule_dict) is False:
            Common.logger(log_type, crawler).info('Download rules not met\n')
        elif any(word in video_dict['video_title'] for word in cls.filter_words(log_type, crawler)):
            Common.logger(log_type, crawler).info(f"Title hit a filter word: {video_dict['video_title']}\n")
        elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
            Common.logger(log_type, crawler).info('Video already downloaded\n')
        else:
            # Download the video
            Common.download_method(log_type=log_type, crawler=crawler, text='video',
                                   title=video_dict['video_title'], url=video_dict['video_url'])
            md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
            try:
                if os.path.getsize(f"./{crawler}/videos/{md_title}/video.mp4") == 0:
                    # Remove the video folder
                    shutil.rmtree(f"./{crawler}/videos/{md_title}")
                    Common.logger(log_type, crawler).info("Video size is 0, folder removed\n")
                    return
            except FileNotFoundError:
                # Remove the video folder
                shutil.rmtree(f"./{crawler}/videos/{md_title}")
                Common.logger(log_type, crawler).info("Video file not found, folder removed\n")
                return
            # Download the cover image
            Common.download_method(log_type=log_type, crawler=crawler, text='cover',
                                   title=video_dict['video_title'], url=video_dict['cover_url'])
            # Save the video info to a txt file
            Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)
            # Upload the video
            Common.logger(log_type, crawler).info("Starting video upload...")
            our_video_id = Publish.upload_and_publish(log_type=log_type,
                                                      crawler=crawler,
                                                      strategy=strategy,
                                                      our_uid=our_uid,
                                                      env=env,
                                                      oss_endpoint=oss_endpoint)
            if env == 'dev':
                our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
            else:
                our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
            Common.logger(log_type, crawler).info("Video upload finished")
            if our_video_id is None:
                try:
                    # Remove the video folder
                    shutil.rmtree(f"./{crawler}/videos/{md_title}")
                    return
                except FileNotFoundError:
                    return
            # Write the video row to Feishu
            Feishu.insert_columns(log_type, 'douyin', "82c8d9", "ROWS", 1, 2)
            upload_time = int(time.time())
            values = [[
                our_video_id,
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
                "推荐",
                str(video_dict['video_id']),
                video_dict['video_title'],
                our_video_link,
                video_dict['play_cnt'],
                video_dict['comment_cnt'],
                video_dict['like_cnt'],
                video_dict['share_cnt'],
                video_dict['duration'],
                str(video_dict['video_width']) + '*' + str(video_dict['video_height']),
                video_dict['publish_time_str'],
                video_dict['user_name'],
                video_dict['user_id'],
                video_dict['avatar_url'],
                video_dict['cover_url'],
                video_dict['video_url']
            ]]
            time.sleep(0.5)
            Feishu.update_values(log_type, 'douyin', "82c8d9", "A2:Z2", values)
            Common.logger(log_type, crawler).info("Video saved to the cloud document\n")
            # Save the video info to the database
            insert_sql = f""" insert into crawler_video(video_id,
                                                        user_id,
                                                        out_user_id,
                                                        platform,
                                                        strategy,
                                                        out_video_id,
                                                        video_title,
                                                        cover_url,
                                                        video_url,
                                                        duration,
                                                        publish_time,
                                                        play_cnt,
                                                        comment_cnt,
                                                        like_cnt,
                                                        share_cnt,
                                                        crawler_rule,
                                                        width,
                                                        height)
                                                        values({our_video_id},
                                                        {our_uid},
                                                        "{video_dict['user_id']}",
                                                        "{cls.platform}",
                                                        "推荐爬虫策略",
                                                        "{video_dict['video_id']}",
                                                        "{video_dict['video_title']}",
                                                        "{video_dict['cover_url']}",
                                                        "{video_dict['video_url']}",
                                                        {int(video_dict['duration'])},
                                                        "{video_dict['publish_time_str']}",
                                                        {int(video_dict['play_cnt'])},
                                                        {int(video_dict['comment_cnt'])},
                                                        {int(video_dict['like_cnt'])},
                                                        {int(video_dict['share_cnt'])},
                                                        '{json.dumps(rule_dict)}',
                                                        {int(video_dict['video_width'])},
                                                        {int(video_dict['video_height'])}) """
            Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
            MysqlHelper.update_values(log_type, crawler, insert_sql, env, machine)
            Common.logger(log_type, crawler).info('Video info inserted into the database!\n')


if __name__ == '__main__':
    # DyRecommend.get_recommend('recommend', 'douyin', '推荐抓取策略', 'inner', 'prod', 'aliyun')
    DyRecommend.get_videolist('recommend', 'douyin', '推荐抓取策略', 6282709, 'outer', 'dev', 'aliyun')