123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375 |
- # -*- coding: utf-8 -*-
- # @Time: 2023/11/07
- import os
- import random
- import sys
- import time
- import cv2
- import requests
- import json
- import urllib3
- sys.path.append(os.getcwd())
- from datetime import timedelta, date
- from common.common import Common
- from common import AliyunLogger
- from common.mq import MQ
- from requests.adapters import HTTPAdapter
- from common.scheduling_db import MysqlHelper
- from common.public import get_config_from_mysql, download_rule
- from douyin.douyin_author.douyin_author_scheduling_help import DouYinHelper
- from common.limit import AuthorLimit
- class DouyinauthorScheduling:
- platform = "抖音"
- download_cnt = 0
- limiter = AuthorLimit(platform="douyin", mode="author")
- @classmethod
- def videos_cnt(cls, rule_dict):
- videos_cnt = rule_dict.get("videos_cnt", {}).get("min", 0)
- if videos_cnt == 0:
- videos_cnt = 1000
- return videos_cnt
- @classmethod
- def get_cookie(cls, log_type, crawler, env):
- select_sql = f""" select * from crawler_config where source="{crawler}" """
- configs = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
- for config in configs:
- if "cookie" in config["config"]:
- cookie_dict = {
- "cookie_id": config["id"],
- "title": config["title"].strip(),
- "cookie": dict(eval(config["config"]))["cookie"].strip(),
- "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(config["update_time"] / 1000))),
- "operator": config["operator"].strip()
- }
- return cookie_dict
- @classmethod
- def get_videoList(cls, log_type, crawler, user_dict, rule_dict, env):
- mq = MQ(topic_name="topic_crawler_etl_" + env)
- next_cursor = 0
- special = 0
- for i in range(3):
- # while True:
- flag = user_dict["link"].split("_")[0]
- if flag == "V1":
- special = 0.01
- rule_dict = {
- 'period': {"min": 15, "max": 0},
- }
- elif flag == "V2":
- special = 0.01
- rule_dict = {
- 'period': {"min": 15, "max": 0},
- }
- elif flag == "V3":
- special = 0.01
- rule_dict = {
- 'period': {"min": 15, "max": 0},
- }
- cookie = cls.get_cookie(log_type, crawler, env)["cookie"]
- if user_dict['link'][0] == "V":
- link = user_dict["link"][3:]
- else:
- link = user_dict["link"]
- time.sleep(random.randint(5, 10))
- url = 'https://www.douyin.com/aweme/v1/web/aweme/post/'
- account_id = link
- headers = {
- 'Accept': 'application/json, text/plain, */*',
- 'Accept-Language': 'zh-CN,zh;q=0.9',
- 'Cache-Control': 'no-cache',
- 'Cookie': cookie,
- # 'Cookie': "ttwid=1%7Cyj16cpJ4yxvUv9QWry1Uz3MoZ3Ci7FHGQR4qW3W70ac%7C1704436922%7C435637f1aa3c55cbed4587acf02003b5d74cfcac945a0df3893e041a288ce3c1; bd_ticket_guard_client_web_domain=2; passport_assist_user=CjzcKtls0e65w_tjpggJoAB9du8ZDR8XRxt178-cIHsJhxCRZPLxqAj0PHWKZ4g2xmxWzTHsK7mi4vxt0lcaSgo8W1SZlyQoj2vcxlToyotQ902cRuWULW6HqkHEJHMRcIoo_Y7maHi82HqNSTCVE5xBSQnTOXW31hxsJ4EIENPsxQ0Yia_WVCABIgED619Mew%3D%3D; n_mh=uPso8EqWH8OYYER0xnVFOgB1e9TbTzK9J1CBmr4IQVA; sso_uid_tt=f829ccc6652eae601ff8e56da1fccdb5; sso_uid_tt_ss=f829ccc6652eae601ff8e56da1fccdb5; toutiao_sso_user=d2fa09f7626319fb35fd2553b5ec5b76; toutiao_sso_user_ss=d2fa09f7626319fb35fd2553b5ec5b76; LOGIN_STATUS=1; store-region=cn-hn; store-region-src=uid; d_ticket=dd5890b4b8f873453c1f1a090b9aa6ccb205c; sid_ucp_v1=1.0.0-KGU1NTNlNmFjMGJmZTEwNjFhYWZjZTMyMGEzYmI4YmVjOTdjYzU0N2YKGQjPnMP64wEQiLzirQYY7zEgDDgGQPQHSAQaAmxxIiA1M2NmNzM1ZjUyMzA1ZTkxZDMyZTEyMmVhM2ZhYTQ1YQ; ssid_ucp_v1=1.0.0-KGU1NTNlNmFjMGJmZTEwNjFhYWZjZTMyMGEzYmI4YmVjOTdjYzU0N2YKGQjPnMP64wEQiLzirQYY7zEgDDgGQPQHSAQaAmxxIiA1M2NmNzM1ZjUyMzA1ZTkxZDMyZTEyMmVhM2ZhYTQ1YQ; dy_swidth=1449; dy_sheight=906; __live_version__=%221.1.1.8009%22; live_use_vvc=%22false%22; xgplayer_user_id=510446933624; uid_tt=f829ccc6652eae601ff8e56da1fccdb5; uid_tt_ss=f829ccc6652eae601ff8e56da1fccdb5; sid_tt=d2fa09f7626319fb35fd2553b5ec5b76; sessionid=d2fa09f7626319fb35fd2553b5ec5b76; sessionid_ss=d2fa09f7626319fb35fd2553b5ec5b76; passport_csrf_token=34235b71f9c981e07032bd9041848f1e; passport_csrf_token_default=34235b71f9c981e07032bd9041848f1e; download_guide=%223%2F20240313%2F1%22; publish_badge_show_info=%220%2C0%2C0%2C1710488198823%22; EnhanceDownloadGuide=%220_0_0_0_2_1710734032%22; sid_ucp_sso_v1=1.0.0-KGFiZDU4YWZmOTcwOWI1ZGIzOWUxNTBjZTc2Y2IxNmY4MjA3NDU1ZjEKHQjPnMP64wEQ2b7lrwYY7zEgDDDzrarJBTgGQPQHGgJobCIgZDJmYTA5Zjc2MjYzMTlmYjM1ZmQyNTUzYjVlYzViNzY; ssid_ucp_sso_v1=1.0.0-KGFiZDU4YWZmOTcwOWI1ZGIzOWUxNTBjZTc2Y2IxNmY4MjA3NDU1ZjEKHQjPnMP64wEQ2b7lrwYY7zEgDDDzrarJBTgGQPQHGgJobCIgZDJmYTA5Zjc2MjYzMTlmYjM1ZmQyNTUzYjVlYzViNzY; sid_guard=d2fa09f7626319fb35fd2553b5ec5b76%7C1710841689%7C5184001%7CSat%2C+18-May-2024+09%3A48%3A10+GMT; __ac_nonce=065fb9e6800dd2b1e14b1; __ac_signature=_02B4Z6wo00f01l39XCgAAIDBYFRGtw.YiKZd3ViAAPKTmnfg2zaxyzrXD6iNtRPPtcoSm5zbE6snYTcix8FTXgxsxQK195O6vG-zEOdZqKTq-ouYFPANlN1Jmu1.ZxBLTzOstKAOorrHEYQN06; douyin.com; xg_device_score=7.654580937785368; device_web_cpu_core=16; device_web_memory_size=8; architecture=amd64; csrf_session_id=f524ab33e8de0e4e922d8b48c362e6c1; strategyABtestKey=%221710988910.691%22; volume_info=%7B%22isUserMute%22%3Afalse%2C%22isMute%22%3Afalse%2C%22volume%22%3A0.281%7D; stream_player_status_params=%22%7B%5C%22is_auto_play%5C%22%3A0%2C%5C%22is_full_screen%5C%22%3A0%2C%5C%22is_full_webscreen%5C%22%3A1%2C%5C%22is_mute%5C%22%3A0%2C%5C%22is_speed%5C%22%3A1%2C%5C%22is_visible%5C%22%3A1%7D%22; my_rd=2; FOLLOW_NUMBER_YELLOW_POINT_INFO=%22MS4wLjABAAAABDh3DP0PxVP5lMvEKL8Fhg8sSD732Z2rOc0db7nqY9o%2F1711036800000%2F0%2F1710989305107%2F0%22; SEARCH_RESULT_LIST_TYPE=%22single%22; stream_recommend_feed_params=%22%7B%5C%22cookie_enabled%5C%22%3Atrue%2C%5C%22screen_width%5C%22%3A1449%2C%5C%22screen_height%5C%22%3A906%2C%5C%22browser_online%5C%22%3Atrue%2C%5C%22cpu_core_num%5C%22%3A16%2C%5C%22device_memory%5C%22%3A8%2C%5C%22downlink%5C%22%3A10%2C%5C%22effective_type%5C%22%3A%5C%224g%5C%22%2C%5C%22round_trip_time%5C%22%3A50%7D%22; bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtcmVlLXB1YmxpYy1rZXkiOiJCT05aUWpZcmNjMWhVYXhidlVoUG9uUC9lV0phYzBNbnhTQldxUmZESGFZQ290cUhOSE1GdmJ2ZTdSY1REdVpiemdHUU82cS90dWhzNVdnTmxaeVR3TzQ9IiwiYmQtdGlja2V0LWd1YXJkLXdlYi12ZXJzaW9uIjoxfQ%3D%3D; tt_scid=IMdpIr4sRF90L0IaD0TdSlOy0Sm1rX-hlw5-OAxNAcisxsztezRzg3356KIGHx4cee78; pwa2=%220%7C0%7C1%7C0%22; odin_tt=f05b7460c2544b994f5deae19a5bbf0828870c64564040ef36c9d7cb40da9e44bc41ee52b1cac76d042b80fc4dcb4394; msToken=Tq7-Wv99mG0yhHDIz7-R1fxSAQyf8R7dNAvHMxnjrbWpbi531L8TI6VdQhQSDTAl8jQQJr9IWhJpbRu3E01IgC5uQ7DE_5oGYW046WpPb_bjluz255YhMdqfJ3Qmeg==; FOLLOW_LIVE_POINT_INFO=%22MS4wLjABAAAABDh3DP0PxVP5lMvEKL8Fhg8sSD732Z2rOc0db7nqY9o%2F1711036800000%2F0%2F1710990128301%2F0%22; msToken=YstZKHMONS09-8nDsHM40jwWV2nr5E1wYmv7cBeAmeY02prkpNLjRwB8C3tp52nc1hxvL5R1F-hkmvDSc0TNeNxz-DNodK3GMV8dK3gkVT8DVPKeVL5umskY5Am5; passport_fe_beating_status=false; IsDouyinActive=true; home_can_add_dy_2_desktop=%220%22",
- 'Pragma': 'no-cache',
- 'Referer': f'https://www.douyin.com/user/{account_id}',
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
- 'Chrome/118.0.0.0 Safari/537.36',
- }
- query = DouYinHelper.get_full_query(ua=headers['User-Agent'], extra_data={
- 'sec_user_id': account_id,
- 'max_cursor': next_cursor,
- 'locate_query': 'false',
- 'show_live_replay_strategy': '1',
- 'need_time_list': '1',
- 'time_list_query': '0',
- 'whale_cut_token': '',
- 'cut_version': '1',
- 'count': '18',
- 'publish_video_strategy_type': '2',
- })
- urllib3.disable_warnings()
- s = requests.session()
- # max_retries=3 重试3次
- s.mount('http://', HTTPAdapter(max_retries=3))
- s.mount('https://', HTTPAdapter(max_retries=3))
- response = requests.request(method='GET', url=url, headers=headers, params=query)
- body = response.content.decode()
- obj = json.loads(body)
- has_more = True if obj.get('has_more', 0) == 1 else False
- next_cursor = str(obj.get('max_cursor')) if has_more else None
- data = obj.get('aweme_list', [])
- response.close()
- if response.status_code != 200:
- Common.logger(log_type, crawler).warning(f"data:{data}\n")
- AliyunLogger.logging(
- code="2000",
- platform=crawler,
- mode=log_type,
- env=env,
- message=f"data:{data}\n"
- )
- return
- elif len(data) == 0:
- Common.logger(log_type, crawler).warning(f"没有更多视频啦 ~\n")
- AliyunLogger.logging(
- code="2001",
- platform=crawler,
- mode=log_type,
- env=env,
- message=f"没有更多视频啦 ~\n"
- )
- return
- for i in range(len(data)):
- try:
- entity_type = data[i].get('search_impr').get('entity_type')
- if entity_type == 'GENERAL':
- Common.logger(log_type, crawler).info('扫描到一条视频\n')
- AliyunLogger.logging(
- code="1001",
- platform=crawler,
- mode=log_type,
- env=env,
- message='扫描到一条视频\n'
- )
- is_top = data[i].get('is_top') # 是否置顶
- video_id = data[i].get('aweme_id') # 文章id
- video_title = data[i].get('desc', "").strip().replace("\n", "") \
- .replace("/", "").replace("\\", "").replace("\r", "") \
- .replace(":", "").replace("*", "").replace("?", "") \
- .replace("?", "").replace('"', "").replace("<", "") \
- .replace(">", "").replace("|", "").replace(" ", "") \
- .replace("&NBSP", "").replace(".", "。").replace(" ", "") \
- .replace("'", "").replace("#", "").replace("Merge", "")
- publish_time_stamp = data[i].get('create_time') # 发布时间
- publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
- # video_url = data[i].get('video').get('play_addr').get('url_list')[0] # 视频链接
- video_uri = data[i].get('video', {}).get('play_addr', {}).get('uri')
- ratio = f'{data[i].get("video", {}).get("height")}p'
- video_url = f'https://www.iesdouyin.com/aweme/v1/play/?video_id={video_uri}&ratio={ratio}&line=0' # 视频链接
- cover_url = data[i].get('video').get('cover').get('url_list')[0] # 视频封面
- digg_count = int(data[i].get('statistics').get('digg_count')) # 点赞
- comment_count = int(data[i].get('statistics').get('comment_count')) # 评论
- # collect_count = data[i].get('statistics').get('collect_count') # 收藏
- share_count = int(data[i].get('statistics').get('share_count')) # 转发
- if share_count < 500:
- AliyunLogger.logging(
- code="2004",
- platform=crawler,
- mode=log_type,
- env=env,
- message=f'分享小于500\n'
- )
- continue
- video_percent = '%.2f' % (share_count / digg_count)
- special = float(0.25)
- if float(video_percent) < special:
- AliyunLogger.logging(
- code="2004",
- platform=crawler,
- mode=log_type,
- env=env,
- message=f'分享/点赞小于25%\n'
- )
- continue
- duration = cls.video_duration(video_url)
- if int(duration) < 45:
- AliyunLogger.logging(
- code="2004",
- platform=crawler,
- mode=log_type,
- env=env,
- message=f'视频时常小于45秒\n'
- )
- continue
- # if special != 0:
- # if share_count != 0:
- # video_percent = '%.2f' % (share_count / digg_count)
- # special = float(special)
- # if float(video_percent) < special:
- # Common.logger(log_type, crawler).info(f"不符合条件:分享/点赞-{video_percent}\n")
- # AliyunLogger.logging(
- # code="2004",
- # platform=crawler,
- # mode=log_type,
- # env=env,
- # message=f"不符合条件:分享/点赞-{video_percent},点赞量-{digg_count}\n"
- # )
- # continue
- video_dict = {'video_title': video_title,
- 'video_id': video_id,
- 'play_cnt': 0,
- 'like_cnt': digg_count,
- 'comment_cnt': comment_count,
- 'share_cnt': share_count,
- 'video_width': 0,
- 'video_height': 0,
- 'duration': 0,
- 'publish_time_stamp': publish_time_stamp,
- 'publish_time_str': publish_time_str,
- 'user_name': "douyin",
- 'user_id': video_id,
- 'avatar_url': '',
- 'cover_url': cover_url,
- 'video_url': video_url,
- 'session': f"douyin-{int(time.time())}"}
- for k, v in video_dict.items():
- Common.logger(log_type, crawler).info(f"{k}:{v}")
- AliyunLogger.logging(
- code="1000",
- platform=crawler,
- mode=log_type,
- env=env,
- message=f"{video_dict}\n"
- )
- if is_top == 0:
- if int((int(time.time()) - int(publish_time_stamp)) / (3600*24)) > int(rule_dict.get("period", {}).get("max", 1000)):
- Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
- AliyunLogger.logging(
- code="2004",
- platform=crawler,
- mode=log_type,
- env=env,
- message=f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n'
- )
- return
- if video_dict["video_id"] == '' or video_dict["cover_url"] == '' or video_dict["video_url"] == '':
- Common.logger(log_type, crawler).info('无效视频\n')
- AliyunLogger.logging(
- code="2004",
- platform=crawler,
- mode=log_type,
- env=env,
- message='无效视频\n'
- )
- elif download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
- Common.logger(log_type, crawler).info("不满足抓取规则\n")
- AliyunLogger.logging(
- code="2004",
- platform=crawler,
- mode=log_type,
- env=env,
- message='不满足抓取规则\n'
- )
- elif any(str(word) if str(word) in video_dict["video_title"] else False
- for word in get_config_from_mysql(log_type=log_type,
- source=crawler,
- env=env,
- text="filter",
- action="")) is True:
- Common.logger(log_type, crawler).info('已中过滤词\n')
- AliyunLogger.logging(
- code="2004",
- platform=crawler,
- mode=log_type,
- env=env,
- message='已中过滤词\n'
- )
- elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
- Common.logger(log_type, crawler).info('视频已下载\n')
- AliyunLogger.logging(
- code="2002",
- platform=crawler,
- mode=log_type,
- env=env,
- message='视频已下载\n'
- )
- else:
- video_dict["out_user_id"] = video_dict["user_id"]
- video_dict["platform"] = crawler
- video_dict["strategy"] = log_type
- video_dict["out_video_id"] = video_dict["video_id"]
- video_dict["width"] = video_dict["video_width"]
- video_dict["height"] = video_dict["video_height"]
- video_dict["crawler_rule"] = json.dumps(rule_dict)
- video_dict["user_id"] = user_dict["uid"]
- video_dict["publish_time"] = video_dict["publish_time_str"]
- video_dict["strategy_type"] = log_type
- limit_flag = cls.limiter.author_limitation(user_id=video_dict['user_id'])
- if limit_flag:
- mq.send_msg(video_dict)
- cls.download_cnt += 1
- AliyunLogger.logging(code="1002", message="成功发送至 ETL", data=video_dict)
- except Exception as e:
- Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
- AliyunLogger.logging(
- code="3000",
- platform=crawler,
- mode=log_type,
- env=env,
- message=f"抓取单条视频异常:{e}\n"
- )
- @classmethod
- def repeat_video(cls, log_type, crawler, video_id, env):
- sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
- repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
- return len(repeat_video)
- @classmethod
- def get_author_videos(cls, log_type, crawler, user_list, rule_dict, env):
- for user_dict in user_list:
- try:
- Common.logger(log_type, crawler).info(f"开始抓取 {user_dict['nick_name']} 主页视频")
- AliyunLogger.logging(
- code="2000",
- platform=crawler,
- mode=log_type,
- env=env,
- message=f"开始抓取 {user_dict['nick_name']} 主页视频"
- )
- cls.download_cnt = 0
- cls.get_videoList(log_type=log_type,
- crawler=crawler,
- user_dict=user_dict,
- rule_dict=rule_dict,
- env=env)
- except Exception as e:
- Common.logger(log_type, crawler).warning(f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n")
- AliyunLogger.logging(
- code="3000",
- platform=crawler,
- mode=log_type,
- env=env,
- message=f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n"
- )
- @classmethod
- def video_duration(cls, filename):
- cap = cv2.VideoCapture(filename)
- if cap.isOpened():
- rate = cap.get(5)
- frame_num = cap.get(7)
- duration = frame_num / rate
- return duration
- return 0
- if __name__ == "__main__":
- print(DouyinauthorScheduling.get_cookie("author", "douyin", "prod")["cookie"])
- pass
|