|
@@ -15,18 +15,40 @@ from hashlib import md5
|
|
|
import requests
|
|
|
import urllib3
|
|
|
from requests.adapters import HTTPAdapter
|
|
|
+
|
|
|
sys.path.append(os.getcwd())
|
|
|
-from common.scheduling_db import MysqlHelper
|
|
|
+from common.scheduling_db import MysqlHelper
|
|
|
from common.common import Common
|
|
|
from common.feishu import Feishu
|
|
|
from common.publish import Publish
|
|
|
+from common.public import get_user_from_mysql, get_config_from_mysql, download_rule
|
|
|
|
|
|
|
|
|
-class SchedulingFollow:
|
|
|
+class ScheduleXiguaFollow:
|
|
|
# 个人主页视频翻页参数
|
|
|
offset = 0
|
|
|
platform = "西瓜视频"
|
|
|
|
|
|
@classmethod
def download_rule(cls, video_info_dict, rule_dict):
    """Return True when a video meets every minimum threshold in ``rule_dict``.

    Args:
        video_info_dict: Mapping read for the keys ``play_cnt``,
            ``comment_cnt``, ``like_cnt``, ``duration``, ``video_width``
            and ``video_height``.
        rule_dict: Mapping of rule name (``play_cnt``, ``comment_cnt``,
            ``like_cnt``, ``duration``, ``width``, ``height``) to a mapping
            that carries a ``min`` value. A rule that is absent, or whose
            ``min`` is missing/None, imposes no lower bound (treated as 0).

    Returns:
        bool: True if play count, comment count, like count and duration
        each reach their minimum AND at least one of width / height reaches
        its minimum; False otherwise.

    Raises:
        KeyError: If ``video_info_dict`` lacks a key that gets consulted.
    """
    # NOTE(review): download_publish in this file calls the module-level
    # ``download_rule`` imported from common.public (different signature),
    # not this classmethod — confirm this method is still needed.

    def minimum(rule_name):
        # Missing rule or missing/None "min" means "no lower bound".
        rule = rule_dict.get(rule_name) or {}
        return rule.get('min') or 0

    # These four metrics share the same key in the video info and the rules.
    for metric in ('play_cnt', 'comment_cnt', 'like_cnt', 'duration'):
        if not (video_info_dict[metric] >= minimum(metric)):
            return False

    # Resolution passes when EITHER dimension is large enough (kept as "or",
    # exactly as the original condition was written).
    return bool(video_info_dict['video_width'] >= minimum('width')
                or video_info_dict['video_height'] >= minimum('height'))
|
|
|
+
|
|
|
@classmethod
|
|
|
def get_users(cls, log_type, crawler, task, env):
|
|
|
link_list = task['spider_link']
|
|
@@ -50,64 +72,6 @@ class SchedulingFollow:
|
|
|
Common.logger(log_type, crawler).info(f"user_list:{user_list}")
|
|
|
return user_list
|
|
|
|
|
|
- # 下载规则
|
|
|
- @classmethod
|
|
|
- def download_rule_scheduling(cls, video_info_dict, task):
|
|
|
- try:
|
|
|
- play_cnt_min = int(task['play_cnt']['min'])
|
|
|
- except:
|
|
|
- play_cnt_min = 0
|
|
|
-
|
|
|
- try:
|
|
|
- video_like_min = int(task['video_like']['min'])
|
|
|
- except:
|
|
|
- video_like_min = 0
|
|
|
-
|
|
|
- try:
|
|
|
- share_cnt_min = int(task['share_cnt']['min'])
|
|
|
- except:
|
|
|
- share_cnt_min = 0
|
|
|
-
|
|
|
- try:
|
|
|
- video_width_min = int(task['video_width']['min'])
|
|
|
- except:
|
|
|
- video_width_min = 0
|
|
|
-
|
|
|
- try:
|
|
|
- video_height_min = task['video_height']['min']
|
|
|
- except:
|
|
|
- video_height_min = 0
|
|
|
-
|
|
|
- try:
|
|
|
- duration_min = int(task['duration_min'])
|
|
|
- except:
|
|
|
- duration_min = 0
|
|
|
-
|
|
|
- try:
|
|
|
- duration_max = int(task['duration_max'])
|
|
|
- except:
|
|
|
- duration_max = 1000000000
|
|
|
-
|
|
|
- if int(video_info_dict['play_cnt']) >= play_cnt_min:
|
|
|
- if int(video_info_dict['like_cnt']) >= video_like_min:
|
|
|
- if int(video_info_dict['share_cnt']) >= share_cnt_min:
|
|
|
- if duration_max >= int(video_info_dict['duration']) >= duration_min:
|
|
|
- if int(video_info_dict['video_width']) >= video_width_min:
|
|
|
- if int(video_info_dict['video_height']) >= video_height_min:
|
|
|
- return True
|
|
|
- else:
|
|
|
- return False
|
|
|
- else:
|
|
|
- return False
|
|
|
- else:
|
|
|
- return False
|
|
|
- else:
|
|
|
- return False
|
|
|
- else:
|
|
|
- return False
|
|
|
- else:
|
|
|
- return False
|
|
|
-
|
|
|
# 过滤词库
|
|
|
@classmethod
|
|
|
def filter_words(cls, log_type, crawler):
|
|
@@ -197,7 +161,8 @@ class SchedulingFollow:
|
|
|
# max_retries=3 重试3次
|
|
|
s.mount('http://', HTTPAdapter(max_retries=3))
|
|
|
s.mount('https://', HTTPAdapter(max_retries=3))
|
|
|
- response = s.get(url=url, headers=headers, params=params, cookies=cookies, verify=False, proxies=Common.tunnel_proxies(), timeout=5)
|
|
|
+ response = s.get(url=url, headers=headers, params=params, cookies=cookies, verify=False,
|
|
|
+ proxies=Common.tunnel_proxies(), timeout=5)
|
|
|
response.close()
|
|
|
if 'data' not in response.json() or response.json()['data'] == '':
|
|
|
Common.logger(log_type, crawler).warning('get_video_info: response: {}', response)
|
|
@@ -212,7 +177,8 @@ class SchedulingFollow:
|
|
|
video_url_dict["video_height"] = 0
|
|
|
|
|
|
elif 'dash_120fps' in video_info['videoResource']:
|
|
|
- if "video_list" in video_info['videoResource']['dash_120fps'] and 'video_4' in video_info['videoResource']['dash_120fps']['video_list']:
|
|
|
+ if "video_list" in video_info['videoResource']['dash_120fps'] and 'video_4' in \
|
|
|
+ video_info['videoResource']['dash_120fps']['video_list']:
|
|
|
video_url = video_info['videoResource']['dash_120fps']['video_list']['video_4']['backup_url_1']
|
|
|
audio_url = video_info['videoResource']['dash_120fps']['video_list']['video_4']['backup_url_1']
|
|
|
if len(video_url) % 3 == 1:
|
|
@@ -231,7 +197,8 @@ class SchedulingFollow:
|
|
|
video_url_dict["audio_url"] = audio_url
|
|
|
video_url_dict["video_width"] = video_width
|
|
|
video_url_dict["video_height"] = video_height
|
|
|
- elif "video_list" in video_info['videoResource']['dash_120fps'] and 'video_3' in video_info['videoResource']['dash_120fps']['video_list']:
|
|
|
+ elif "video_list" in video_info['videoResource']['dash_120fps'] and 'video_3' in \
|
|
|
+ video_info['videoResource']['dash_120fps']['video_list']:
|
|
|
video_url = video_info['videoResource']['dash_120fps']['video_list']['video_3']['backup_url_1']
|
|
|
audio_url = video_info['videoResource']['dash_120fps']['video_list']['video_3']['backup_url_1']
|
|
|
if len(video_url) % 3 == 1:
|
|
@@ -250,7 +217,8 @@ class SchedulingFollow:
|
|
|
video_url_dict["audio_url"] = audio_url
|
|
|
video_url_dict["video_width"] = video_width
|
|
|
video_url_dict["video_height"] = video_height
|
|
|
- elif "video_list" in video_info['videoResource']['dash_120fps'] and 'video_2' in video_info['videoResource']['dash_120fps']['video_list']:
|
|
|
+ elif "video_list" in video_info['videoResource']['dash_120fps'] and 'video_2' in \
|
|
|
+ video_info['videoResource']['dash_120fps']['video_list']:
|
|
|
video_url = video_info['videoResource']['dash_120fps']['video_list']['video_2']['backup_url_1']
|
|
|
audio_url = video_info['videoResource']['dash_120fps']['video_list']['video_2']['backup_url_1']
|
|
|
if len(video_url) % 3 == 1:
|
|
@@ -269,7 +237,8 @@ class SchedulingFollow:
|
|
|
video_url_dict["audio_url"] = audio_url
|
|
|
video_url_dict["video_width"] = video_width
|
|
|
video_url_dict["video_height"] = video_height
|
|
|
- elif "video_list" in video_info['videoResource']['dash_120fps'] and 'video_1' in video_info['videoResource']['dash_120fps']['video_list']:
|
|
|
+ elif "video_list" in video_info['videoResource']['dash_120fps'] and 'video_1' in \
|
|
|
+ video_info['videoResource']['dash_120fps']['video_list']:
|
|
|
video_url = video_info['videoResource']['dash_120fps']['video_list']['video_1']['backup_url_1']
|
|
|
audio_url = video_info['videoResource']['dash_120fps']['video_list']['video_1']['backup_url_1']
|
|
|
if len(video_url) % 3 == 1:
|
|
@@ -292,11 +261,17 @@ class SchedulingFollow:
|
|
|
elif 'dynamic_video' in video_info['videoResource']['dash_120fps'] \
|
|
|
and 'dynamic_video_list' in video_info['videoResource']['dash_120fps']['dynamic_video'] \
|
|
|
and 'dynamic_audio_list' in video_info['videoResource']['dash_120fps']['dynamic_video'] \
|
|
|
- and len(video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list']) != 0 \
|
|
|
- and len(video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list']) != 0:
|
|
|
-
|
|
|
- video_url = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['backup_url_1']
|
|
|
- audio_url = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list'][-1]['backup_url_1']
|
|
|
+ and len(
|
|
|
+ video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list']) != 0 \
|
|
|
+ and len(
|
|
|
+ video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list']) != 0:
|
|
|
+
|
|
|
+ video_url = \
|
|
|
+ video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1][
|
|
|
+ 'backup_url_1']
|
|
|
+ audio_url = \
|
|
|
+ video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list'][-1][
|
|
|
+ 'backup_url_1']
|
|
|
if len(video_url) % 3 == 1:
|
|
|
video_url += '=='
|
|
|
elif len(video_url) % 3 == 2:
|
|
@@ -307,8 +282,12 @@ class SchedulingFollow:
|
|
|
audio_url += '='
|
|
|
video_url = base64.b64decode(video_url).decode('utf8')
|
|
|
audio_url = base64.b64decode(audio_url).decode('utf8')
|
|
|
- video_width = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['vwidth']
|
|
|
- video_height = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['vheight']
|
|
|
+ video_width = \
|
|
|
+ video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1][
|
|
|
+ 'vwidth']
|
|
|
+ video_height = \
|
|
|
+ video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1][
|
|
|
+ 'vheight']
|
|
|
video_url_dict["video_url"] = video_url
|
|
|
video_url_dict["audio_url"] = audio_url
|
|
|
video_url_dict["video_width"] = video_width
|
|
@@ -320,7 +299,8 @@ class SchedulingFollow:
|
|
|
video_url_dict["video_height"] = 0
|
|
|
|
|
|
elif 'dash' in video_info['videoResource']:
|
|
|
- if "video_list" in video_info['videoResource']['dash'] and 'video_4' in video_info['videoResource']['dash']['video_list']:
|
|
|
+ if "video_list" in video_info['videoResource']['dash'] and 'video_4' in \
|
|
|
+ video_info['videoResource']['dash']['video_list']:
|
|
|
video_url = video_info['videoResource']['dash']['video_list']['video_4']['backup_url_1']
|
|
|
audio_url = video_info['videoResource']['dash']['video_list']['video_4']['backup_url_1']
|
|
|
if len(video_url) % 3 == 1:
|
|
@@ -339,7 +319,8 @@ class SchedulingFollow:
|
|
|
video_url_dict["audio_url"] = audio_url
|
|
|
video_url_dict["video_width"] = video_width
|
|
|
video_url_dict["video_height"] = video_height
|
|
|
- elif "video_list" in video_info['videoResource']['dash'] and 'video_3' in video_info['videoResource']['dash']['video_list']:
|
|
|
+ elif "video_list" in video_info['videoResource']['dash'] and 'video_3' in \
|
|
|
+ video_info['videoResource']['dash']['video_list']:
|
|
|
video_url = video_info['videoResource']['dash']['video_list']['video_3']['backup_url_1']
|
|
|
audio_url = video_info['videoResource']['dash']['video_list']['video_3']['backup_url_1']
|
|
|
if len(video_url) % 3 == 1:
|
|
@@ -358,7 +339,8 @@ class SchedulingFollow:
|
|
|
video_url_dict["audio_url"] = audio_url
|
|
|
video_url_dict["video_width"] = video_width
|
|
|
video_url_dict["video_height"] = video_height
|
|
|
- elif "video_list" in video_info['videoResource']['dash'] and 'video_2' in video_info['videoResource']['dash']['video_list']:
|
|
|
+ elif "video_list" in video_info['videoResource']['dash'] and 'video_2' in \
|
|
|
+ video_info['videoResource']['dash']['video_list']:
|
|
|
video_url = video_info['videoResource']['dash']['video_list']['video_2']['backup_url_1']
|
|
|
audio_url = video_info['videoResource']['dash']['video_list']['video_2']['backup_url_1']
|
|
|
if len(video_url) % 3 == 1:
|
|
@@ -377,7 +359,8 @@ class SchedulingFollow:
|
|
|
video_url_dict["audio_url"] = audio_url
|
|
|
video_url_dict["video_width"] = video_width
|
|
|
video_url_dict["video_height"] = video_height
|
|
|
- elif "video_list" in video_info['videoResource']['dash'] and 'video_1' in video_info['videoResource']['dash']['video_list']:
|
|
|
+ elif "video_list" in video_info['videoResource']['dash'] and 'video_1' in \
|
|
|
+ video_info['videoResource']['dash']['video_list']:
|
|
|
video_url = video_info['videoResource']['dash']['video_list']['video_1']['backup_url_1']
|
|
|
audio_url = video_info['videoResource']['dash']['video_list']['video_1']['backup_url_1']
|
|
|
if len(video_url) % 3 == 1:
|
|
@@ -403,8 +386,10 @@ class SchedulingFollow:
|
|
|
and len(video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list']) != 0 \
|
|
|
and len(video_info['videoResource']['dash']['dynamic_video']['dynamic_audio_list']) != 0:
|
|
|
|
|
|
- video_url = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['backup_url_1']
|
|
|
- audio_url = video_info['videoResource']['dash']['dynamic_video']['dynamic_audio_list'][-1]['backup_url_1']
|
|
|
+ video_url = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1][
|
|
|
+ 'backup_url_1']
|
|
|
+ audio_url = video_info['videoResource']['dash']['dynamic_video']['dynamic_audio_list'][-1][
|
|
|
+ 'backup_url_1']
|
|
|
if len(video_url) % 3 == 1:
|
|
|
video_url += '=='
|
|
|
elif len(video_url) % 3 == 2:
|
|
@@ -415,8 +400,10 @@ class SchedulingFollow:
|
|
|
audio_url += '='
|
|
|
video_url = base64.b64decode(video_url).decode('utf8')
|
|
|
audio_url = base64.b64decode(audio_url).decode('utf8')
|
|
|
- video_width = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['vwidth']
|
|
|
- video_height = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['vheight']
|
|
|
+ video_width = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1][
|
|
|
+ 'vwidth']
|
|
|
+ video_height = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1][
|
|
|
+ 'vheight']
|
|
|
video_url_dict["video_url"] = video_url
|
|
|
video_url_dict["audio_url"] = audio_url
|
|
|
video_url_dict["video_width"] = video_width
|
|
@@ -555,7 +542,7 @@ class SchedulingFollow:
|
|
|
Common.logger(log_type, crawler).error(f'get_video_url:{e}\n')
|
|
|
|
|
|
@classmethod
|
|
|
- def get_videolist(cls, log_type, crawler, task, our_uid, out_uid, oss_endpoint, env):
|
|
|
+ def get_videolist(cls, log_type, crawler, strategy, task, our_uid, out_uid, oss_endpoint, env):
|
|
|
try:
|
|
|
signature = cls.random_signature()
|
|
|
while True:
|
|
@@ -567,6 +554,8 @@ class SchedulingFollow:
|
|
|
'maxBehotTime': '0',
|
|
|
'order': 'new',
|
|
|
'isHome': '0',
|
|
|
+ # 'msToken': 'G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==',
|
|
|
+ # 'X-Bogus': 'DFSzswVuEkUANjW9ShFTgR/F6qHt',
|
|
|
'_signature': signature,
|
|
|
}
|
|
|
headers = {
|
|
@@ -578,7 +567,8 @@ class SchedulingFollow:
|
|
|
# max_retries=3 重试3次
|
|
|
s.mount('http://', HTTPAdapter(max_retries=3))
|
|
|
s.mount('https://', HTTPAdapter(max_retries=3))
|
|
|
- response = s.get(url=url, headers=headers, params=params, proxies=Common.tunnel_proxies(), verify=False, timeout=5)
|
|
|
+ response = s.get(url=url, headers=headers, params=params, proxies=Common.tunnel_proxies(), verify=False,
|
|
|
+ timeout=5)
|
|
|
response.close()
|
|
|
cls.offset += 30
|
|
|
if response.status_code != 200:
|
|
@@ -589,7 +579,7 @@ class SchedulingFollow:
|
|
|
Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
|
|
|
cls.offset = 0
|
|
|
return
|
|
|
- elif 'videoList' not in response.json()["data"]:
|
|
|
+ elif not response.json()["data"]['videoList']:
|
|
|
Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
|
|
|
cls.offset = 0
|
|
|
return
|
|
@@ -601,7 +591,7 @@ class SchedulingFollow:
|
|
|
video_title = 0
|
|
|
else:
|
|
|
video_title = videoList[i]['title'].strip().replace('手游', '') \
|
|
|
- .replace('/', '').replace('\/', '').replace('\n', '')
|
|
|
+ .replace('/', '').replace('\/', '').replace('\n', '').replace('"', '').replace("'", '')
|
|
|
|
|
|
# video_id
|
|
|
if 'video_id' not in videoList[i]:
|
|
@@ -690,64 +680,44 @@ class SchedulingFollow:
|
|
|
elif 'url' in videoList[i]['video_detail_info']['detail_video_large_image']:
|
|
|
cover_url = videoList[i]['video_detail_info']['detail_video_large_image']['url']
|
|
|
else:
|
|
|
- cover_url = videoList[i]['video_detail_info']['detail_video_large_image']['url_list'][0]['url']
|
|
|
-
|
|
|
- min_publish_time = int(task["min_publish_time"])
|
|
|
- min_publish_day = int(task["min_publish_day"])
|
|
|
- min_publish_day = (date.today() + timedelta(days=-min_publish_day)).strftime("%Y-%m-%d")
|
|
|
- min_publish_day = int(time.mktime(time.strptime(min_publish_day, "%Y-%m-%d")))
|
|
|
- if min_publish_time > 0 and min_publish_day > 0:
|
|
|
- publish_time_rule = min_publish_time
|
|
|
- elif min_publish_time > 0:
|
|
|
- publish_time_rule = min_publish_time
|
|
|
- else:
|
|
|
- publish_time_rule = min_publish_day
|
|
|
-
|
|
|
- if gid == 0 or video_id == 0 or cover_url == 0:
|
|
|
- Common.logger(log_type, crawler).info('无效视频\n')
|
|
|
- elif is_top is True and int(publish_time) < publish_time_rule:
|
|
|
- Common.logger(log_type, crawler).info(f'置顶视频,且发布时间超过抓取时间\n')
|
|
|
- elif int(publish_time) < publish_time_rule:
|
|
|
- Common.logger(log_type, crawler).info(f'发布时间超过抓取时间\n')
|
|
|
- cls.offset = 0
|
|
|
- return
|
|
|
- else:
|
|
|
- video_url_dict = cls.get_video_url(log_type, crawler, gid)
|
|
|
- video_url = video_url_dict["video_url"]
|
|
|
- audio_url = video_url_dict["audio_url"]
|
|
|
- video_width = video_url_dict["video_width"]
|
|
|
- video_height = video_url_dict["video_height"]
|
|
|
-
|
|
|
- video_dict = {'video_title': video_title,
|
|
|
- 'video_id': video_id,
|
|
|
- 'gid': gid,
|
|
|
- 'play_cnt': play_cnt,
|
|
|
- 'comment_cnt': comment_cnt,
|
|
|
- 'like_cnt': like_cnt,
|
|
|
- 'share_cnt': share_cnt,
|
|
|
- 'video_width': video_width,
|
|
|
- 'video_height': video_height,
|
|
|
- 'duration': video_duration,
|
|
|
- 'publish_time_stamp': publish_time,
|
|
|
- 'publish_time_str': publish_time_str,
|
|
|
- 'is_top': is_top,
|
|
|
- 'user_name': user_name,
|
|
|
- 'user_id': user_id,
|
|
|
- 'avatar_url': avatar_url,
|
|
|
- 'cover_url': cover_url,
|
|
|
- 'audio_url': audio_url,
|
|
|
- 'video_url': video_url,
|
|
|
- 'session': signature}
|
|
|
- for k, v in video_dict.items():
|
|
|
- Common.logger(log_type, crawler).info(f"{k}:{v}")
|
|
|
- cls.download_publish(log_type=log_type,
|
|
|
- crawler=crawler,
|
|
|
- video_dict=video_dict,
|
|
|
- task=task,
|
|
|
- strategy=task["task_name"],
|
|
|
- our_uid=our_uid,
|
|
|
- oss_endpoint=oss_endpoint,
|
|
|
- env=env)
|
|
|
+ cover_url = videoList[i]['video_detail_info']['detail_video_large_image']['url_list'][0][
|
|
|
+ 'url']
|
|
|
+ video_url_dict = cls.get_video_url(log_type, crawler, gid)
|
|
|
+ video_url = video_url_dict["video_url"]
|
|
|
+ audio_url = video_url_dict["audio_url"]
|
|
|
+ video_width = video_url_dict["video_width"]
|
|
|
+ video_height = video_url_dict["video_height"]
|
|
|
+
|
|
|
+ video_dict = {'video_title': video_title,
|
|
|
+ 'video_id': video_id,
|
|
|
+ 'gid': gid,
|
|
|
+ 'play_cnt': play_cnt,
|
|
|
+ 'comment_cnt': comment_cnt,
|
|
|
+ 'like_cnt': like_cnt,
|
|
|
+ 'share_cnt': share_cnt,
|
|
|
+ 'video_width': video_width,
|
|
|
+ 'video_height': video_height,
|
|
|
+ 'duration': video_duration,
|
|
|
+ 'publish_time_stamp': publish_time,
|
|
|
+ 'publish_time_str': publish_time_str,
|
|
|
+ 'is_top': is_top,
|
|
|
+ 'user_name': user_name,
|
|
|
+ 'user_id': user_id,
|
|
|
+ 'avatar_url': avatar_url,
|
|
|
+ 'cover_url': cover_url,
|
|
|
+ 'audio_url': audio_url,
|
|
|
+ 'video_url': video_url,
|
|
|
+ 'session': signature}
|
|
|
+ for k, v in video_dict.items():
|
|
|
+ Common.logger(log_type, crawler).info(f"{k}:{v}")
|
|
|
+ cls.download_publish(log_type=log_type,
|
|
|
+ crawler=crawler,
|
|
|
+ strategy=strategy,
|
|
|
+ video_dict=video_dict,
|
|
|
+ task=task,
|
|
|
+ our_uid=our_uid,
|
|
|
+ oss_endpoint=oss_endpoint,
|
|
|
+ env=env)
|
|
|
except Exception as e:
|
|
|
Common.logger(log_type, crawler).error(f"get_videolist:{e}\n")
|
|
|
|
|
@@ -761,19 +731,26 @@ class SchedulingFollow:
|
|
|
@classmethod
|
|
|
def download_publish(cls, log_type, crawler, strategy, video_dict, task, our_uid, oss_endpoint, env):
|
|
|
try:
|
|
|
- if cls.download_rule_scheduling(video_dict, task) is False:
|
|
|
+ filter_words = get_config_from_mysql(log_type, crawler, env, text='filter')
|
|
|
+ for filter_word in filter_words:
|
|
|
+ if filter_word in video_dict['video_title']:
|
|
|
+ Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
|
|
|
+ return
|
|
|
+ if download_rule(log_type, crawler, video_dict, task['rule_dict']) is False:
|
|
|
Common.logger(log_type, crawler).info('不满足抓取规则\n')
|
|
|
- elif any(word if word in video_dict['video_title'] else False for word in cls.filter_words(log_type, crawler)) is True:
|
|
|
- Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
|
|
|
+
|
|
|
elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env) != 0:
|
|
|
Common.logger(log_type, crawler).info('视频已下载\n')
|
|
|
else:
|
|
|
# 下载视频
|
|
|
- Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video', title=video_dict['video_title'], url=video_dict['video_url'])
|
|
|
+ Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video',
|
|
|
+ title=video_dict['video_title'], url=video_dict['video_url'])
|
|
|
# 下载音频
|
|
|
- Common.download_method(log_type=log_type, crawler=crawler, text='xigua_audio', title=video_dict['video_title'], url=video_dict['audio_url'])
|
|
|
+ Common.download_method(log_type=log_type, crawler=crawler, text='xigua_audio',
|
|
|
+ title=video_dict['video_title'], url=video_dict['audio_url'])
|
|
|
# 合成音视频
|
|
|
- Common.video_compose(log_type=log_type, crawler=crawler, video_dir=f"./{crawler}/videos/{video_dict['video_title']}")
|
|
|
+ Common.video_compose(log_type=log_type, crawler=crawler,
|
|
|
+ video_dir=f"./{crawler}/videos/{video_dict['video_title']}")
|
|
|
md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
|
|
|
if os.path.getsize(f"./{crawler}/videos/{md_title}/video.mp4") == 0:
|
|
|
# 删除视频文件夹
|
|
@@ -787,7 +764,8 @@ class SchedulingFollow:
|
|
|
# shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}")
|
|
|
# return
|
|
|
# 下载封面
|
|
|
- Common.download_method(log_type=log_type, crawler=crawler, text='cover', title=video_dict['video_title'], url=video_dict['cover_url'])
|
|
|
+ Common.download_method(log_type=log_type, crawler=crawler, text='cover',
|
|
|
+ title=video_dict['video_title'], url=video_dict['cover_url'])
|
|
|
# 保存视频信息至txt
|
|
|
Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)
|
|
|
|
|
@@ -884,29 +862,26 @@ class SchedulingFollow:
|
|
|
|
|
|
@classmethod
def get_follow_videos(cls, log_type, crawler, task, oss_endpoint, env):
    """Crawl the home-page videos of every user returned by ``get_user_from_mysql``.

    For each user record the external user id is taken from the last path
    segment of ``user["link"]`` and ``cls.get_videolist`` is invoked with the
    fixed strategy name. A failure for one user is logged and does not stop
    the remaining users.

    Args:
        log_type: Passed through to ``Common.logger`` and the helpers.
        crawler: Crawler name; also passed as the third argument of
            ``get_user_from_mysql``.
        task: Task configuration forwarded unchanged to ``get_videolist``.
        oss_endpoint: Forwarded unchanged to ``get_videolist``.
        env: Environment name forwarded to the helpers.

    Returns:
        None.
    """
    # Tolerate a helper that returns None instead of an empty list.
    user_list = get_user_from_mysql(log_type, crawler, crawler, env) or []
    strategy = '定向抓取策略'
    for user in user_list:
        try:
            spider_link = user["link"]
            # Drop any query string and trailing slash before taking the last
            # path segment; a bare split('/')[-1] yields '' for ".../123/"
            # and "123?x=y" for links that carry query parameters.
            out_uid = spider_link.strip().split('?', 1)[0].rstrip('/').split('/')[-1]
            if not out_uid:
                Common.logger(log_type, crawler).warning(
                    f"get_follow_videos: cannot extract out_uid from link {spider_link!r}\n")
                continue
            user_name = user["nick_name"]
            our_uid = user["uid"]
            Common.logger(log_type, crawler).info(f"开始抓取 {user_name} 用户主页视频\n")
            cls.get_videolist(log_type=log_type,
                              crawler=crawler,
                              strategy=strategy,
                              task=task,
                              our_uid=our_uid,
                              out_uid=out_uid,
                              oss_endpoint=oss_endpoint,
                              env=env)
        except Exception as e:
            Common.logger(log_type, crawler).error(f"get_follow_videos:{e}\n")
        finally:
            # Always reset the shared paging offset so a failure for one user
            # cannot leak a stale offset into the next user's crawl.
            cls.offset = 0
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
@@ -916,5 +891,5 @@ if __name__ == '__main__':
|
|
|
# env="dev",
|
|
|
# machine="local")
|
|
|
|
|
|
- print(SchedulingFollow.repeat_video("follow", "xigua", "v0201ag10000ce3jcjbc77u8jsplpgrg", "dev"))
|
|
|
+ print(ScheduleXiguaFollow.repeat_video("follow", "xigua", "v0201ag10000ce3jcjbc77u8jsplpgrg", "dev"))
|
|
|
pass
|