|
@@ -101,10 +101,10 @@ class Search:
|
|
|
for i in range(1, len(user_sheet)):
|
|
|
our_uid = user_sheet[i][6]
|
|
|
search_word = user_sheet[i][4]
|
|
|
- storage = user_sheet[i][5]
|
|
|
tag1 = user_sheet[i][8]
|
|
|
tag2 = user_sheet[i][9]
|
|
|
tag3 = user_sheet[i][10]
|
|
|
+ tag4 = user_sheet[i][11]
|
|
|
Common.logger(log_type, crawler).info(f"正在更新 {search_word} 关键词信息\n")
|
|
|
if our_uid is None:
|
|
|
default_user = Users.get_default_user()
|
|
@@ -112,7 +112,7 @@ class Search:
|
|
|
user_dict = {
|
|
|
'nickName': default_user['nickName'],
|
|
|
'avatarUrl': default_user['avatarUrl'],
|
|
|
- 'tagName': f'{tag1},{tag2},{tag3}',
|
|
|
+ 'tagName': f'{tag1},{tag2},{tag3},{tag4}',
|
|
|
}
|
|
|
Common.logger(log_type, crawler).info(f'新创建的站内UID:{our_uid}')
|
|
|
our_uid = Users.create_uid(log_type, crawler, user_dict, env)
|
|
@@ -585,41 +585,37 @@ class Search:
|
|
|
|
|
|
@classmethod
|
|
|
def get_video_info(cls, log_type, crawler, item_id):
|
|
|
- try:
|
|
|
- d_url = "http://a6.pstatp.com/article/full/11/1/{video_id}/{video_id}/1/0/?iid=3636030325&device_id=5787057242" \
|
|
|
- "&ac=wifi&channel=wandoujia&aid=13&app_name=news_article&version_code=532&version_name=5.3.2&device_platform" \
|
|
|
- "=android&ab_client=a1%2Cc2%2Ce1%2Cf2%2Cg2%2Cb3%2Cf4&abflag=3&ssmix=a&device_type=SM705" \
|
|
|
- "&device_brand=smartisan&os_api=19&os_version=4.4.2&uuid=864593021012562&openudid=e23a5ff037ef2d1a" \
|
|
|
- "&manifest_version_code=532&resolution=1080*1920&dpi=480&update_version_code=5320".format(
|
|
|
- video_id=item_id)
|
|
|
- res = requests.get(url=d_url, headers=random_user_agent('pc'), proxies=Common.tunnel_proxies())
|
|
|
- data = json.loads(res.text)['data']
|
|
|
- item_counter = data['h5_extra']['itemCell']['itemCounter']
|
|
|
- user_info = data['user_info']
|
|
|
- detail_info = data['video_detail_info']
|
|
|
- video_dict = {'video_title': data['title'],
|
|
|
- 'video_id': detail_info['video_id'],
|
|
|
- 'gid': data['group_id'],
|
|
|
- 'play_cnt': item_counter['videoWatchCount'],
|
|
|
- 'comment_cnt': item_counter['commentCount'],
|
|
|
- 'like_cnt': item_counter['diggCount'],
|
|
|
- 'share_cnt': item_counter['shareCount'],
|
|
|
-
|
|
|
-
|
|
|
- 'duration': data['video_duration'],
|
|
|
- 'publish_time_stamp': data['publish_time'],
|
|
|
- 'publish_time_str': time.strftime("%Y-%m-%d %H:%M:%S",
|
|
|
- time.localtime(data['publish_time'])),
|
|
|
- 'user_name': user_info['name'],
|
|
|
- 'user_id': user_info['user_id'],
|
|
|
- 'avatar_url': user_info['avatar_url'],
|
|
|
- 'cover_url': data['large_image']['url'].replace('\u0026', '&'),
|
|
|
-
|
|
|
- }
|
|
|
- return video_dict
|
|
|
- except Exception as e:
|
|
|
- Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
|
|
|
- return {}
|
|
|
+ d_url = "http://a6.pstatp.com/article/full/11/1/{video_id}/{video_id}/1/0/?iid=3636030325&device_id=5787057242" \
|
|
|
+ "&ac=wifi&channel=wandoujia&aid=13&app_name=news_article&version_code=532&version_name=5.3.2&device_platform" \
|
|
|
+ "=android&ab_client=a1%2Cc2%2Ce1%2Cf2%2Cg2%2Cb3%2Cf4&abflag=3&ssmix=a&device_type=SM705" \
|
|
|
+ "&device_brand=smartisan&os_api=19&os_version=4.4.2&uuid=864593021012562&openudid=e23a5ff037ef2d1a" \
|
|
|
+ "&manifest_version_code=532&resolution=1080*1920&dpi=480&update_version_code=5320".format(
|
|
|
+ video_id=item_id)
|
|
|
+ res = requests.get(url=d_url, headers=random_user_agent('pc'), proxies=Common.tunnel_proxies())
|
|
|
+ data = json.loads(res.text)['data']
|
|
|
+ item_counter = data['h5_extra']['itemCell']['itemCounter']
|
|
|
+ user_info = data['user_info']
|
|
|
+ detail_info = data['video_detail_info']
|
|
|
+ video_dict = {'video_title': data['title'],
|
|
|
+ 'video_id': detail_info['video_id'],
|
|
|
+ 'gid': data['group_id'],
|
|
|
+ 'play_cnt': item_counter['videoWatchCount'],
|
|
|
+ 'comment_cnt': item_counter['commentCount'],
|
|
|
+ 'like_cnt': item_counter['diggCount'],
|
|
|
+ 'share_cnt': item_counter['shareCount'],
|
|
|
+
|
|
|
+ 'duration': data['video_duration'],
|
|
|
+ 'publish_time_stamp': data['publish_time'],
|
|
|
+ 'publish_time_str': time.strftime("%Y-%m-%d %H:%M:%S",
|
|
|
+ time.localtime(data['publish_time'])),
|
|
|
+ 'user_name': user_info['name'],
|
|
|
+ 'user_id': user_info['user_id'],
|
|
|
+ 'avatar_url': user_info['avatar_url'],
|
|
|
+ 'cover_url': data['large_image']['url'].replace('\u0026', '&'),
|
|
|
+
|
|
|
+ }
|
|
|
+ return video_dict
|
|
|
+
|
|
|
@classmethod
|
|
|
def is_ruled(cls, log_type, crawler, video_dict, rule_dict):
|
|
|
old_time = int(time.time()) - (3600 * 24 * rule_dict['publish_time'])
|
|
@@ -634,7 +630,7 @@ class Search:
|
|
|
|
|
|
@classmethod
|
|
|
def get_videolist(cls, log_type, crawler, strategy, our_uid, search_word, oss_endpoint, env, machine):
|
|
|
- total_count = 0
|
|
|
+ total_count = 1
|
|
|
offset = 0
|
|
|
while True:
|
|
|
|
|
@@ -659,22 +655,21 @@ class Search:
|
|
|
for video_info in search_list:
|
|
|
v_type = video_info['type']
|
|
|
rule_dict = cls.get_rule(log_type, crawler)
|
|
|
- video_dict = {}
|
|
|
-
|
|
|
if v_type == 'video':
|
|
|
+ item_id = video_info['data']['group_id']
|
|
|
+ old_time = int(time.time()) - (3600 * 24 * rule_dict['publish_time'])
|
|
|
+ if video_info['data']['publish_time'] <= old_time:
|
|
|
+ Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
|
|
|
+ continue
|
|
|
+ elif video_info['data']['video_watch_count'] <= rule_dict['play_cnt']:
|
|
|
+ Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
|
|
|
+ continue
|
|
|
+ elif video_info['data']['video_time'] < rule_dict['min_duration'] or video_info['data'][
|
|
|
+ 'video_time'] > rule_dict['max_duration']:
|
|
|
+ Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
|
|
|
+ continue
|
|
|
try:
|
|
|
- item_id = video_info['data']['group_id']
|
|
|
- old_time = int(time.time()) - (3600 * 24 * rule_dict['publish_time'])
|
|
|
- if video_info['data']['publish_time'] <= old_time:
|
|
|
- continue
|
|
|
- elif video_info['data']['video_watch_count'] <= rule_dict['play_cnt']:
|
|
|
- continue
|
|
|
- elif video_info['data']['video_time'] < rule_dict['min_duration'] or video_info['data']['video_time'] > rule_dict[
|
|
|
- 'max_duration']:
|
|
|
- continue
|
|
|
video_dict = cls.get_video_info(log_type, crawler, item_id)
|
|
|
- if not video_dict:
|
|
|
- continue
|
|
|
video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
|
|
|
video_dict['video_width'] = video_url_dict["video_width"]
|
|
|
video_dict['video_height'] = video_url_dict["video_height"]
|
|
@@ -682,8 +677,36 @@ class Search:
|
|
|
video_dict['video_url'] = video_url_dict["video_url"]
|
|
|
video_dict['session'] = signature
|
|
|
except Exception as e:
|
|
|
- # Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
|
|
|
+ Common.logger(log_type, crawler).error(f'视频:{item_id},获取详情失败,原因:{e}')
|
|
|
continue
|
|
|
+ if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
|
|
|
+ Common.logger(log_type, crawler).info(f'gid:{video_dict["gid"]},视频已下载,无需重复下载\n')
|
|
|
+ continue
|
|
|
+ for k, v in video_dict.items():
|
|
|
+ Common.logger(log_type, crawler).info(f"{k}:{v}")
|
|
|
+
|
|
|
+ try:
|
|
|
+ # print(
|
|
|
+ # f'search_word:{search_word},title:{video_dict["video_title"]},gid:{video_dict["gid"]},offset:{offset}, total:{total_count}')
|
|
|
+ cls.download_publish(
|
|
|
+ search_word=search_word,
|
|
|
+ log_type=log_type,
|
|
|
+ crawler=crawler,
|
|
|
+ video_dict=video_dict,
|
|
|
+ rule_dict=rule_dict,
|
|
|
+ strategy=strategy,
|
|
|
+ our_uid=our_uid,
|
|
|
+ oss_endpoint=oss_endpoint,
|
|
|
+ env=env,
|
|
|
+ machine=machine
|
|
|
+ )
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ Common.logger(log_type, crawler).error(f'视频:{item_id},下载失败,原因:{e}')
|
|
|
+ continue
|
|
|
+ total_count += 1
|
|
|
+ if total_count >= 30:
|
|
|
+ return
|
|
|
elif v_type == 'pseries':
|
|
|
try:
|
|
|
item_id = video_info['data']['group_id']
|
|
@@ -697,57 +720,53 @@ class Search:
|
|
|
p_res = requests.request("GET", p_url, headers=p_headers,
|
|
|
proxies=Common.tunnel_proxies()).json()
|
|
|
except Exception as e:
|
|
|
- # Common.logger(log_type, crawler).error(f'合集:{item_id},没有获取到合集详情,原因:{e}')
|
|
|
+ Common.logger(log_type, crawler).error(f'合集:{item_id},没有获取到合集详情,原因:{e}')
|
|
|
continue
|
|
|
for video in p_res['data']:
|
|
|
item_id = video['item_id']
|
|
|
try:
|
|
|
video_dict = cls.get_video_info(log_type, crawler, item_id)
|
|
|
- if not video_dict:
|
|
|
- continue
|
|
|
- if not cls.is_ruled(log_type, crawler, video_dict, rule_dict):
|
|
|
- Common.logger(log_type, crawler).info(f'gid:{item_id},不符合抓取规则\n')
|
|
|
- continue
|
|
|
- else:
|
|
|
- video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
|
|
|
- video_dict['video_width'] = video_url_dict["video_width"]
|
|
|
- video_dict['video_height'] = video_url_dict["video_height"]
|
|
|
- video_dict['audio_url'] = video_url_dict["audio_url"]
|
|
|
- video_dict['video_url'] = video_url_dict["video_url"]
|
|
|
- video_dict['session'] = signature
|
|
|
- break
|
|
|
+ video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
|
|
|
+ video_dict['video_width'] = video_url_dict["video_width"]
|
|
|
+ video_dict['video_height'] = video_url_dict["video_height"]
|
|
|
+ video_dict['audio_url'] = video_url_dict["audio_url"]
|
|
|
+ video_dict['video_url'] = video_url_dict["video_url"]
|
|
|
+ video_dict['session'] = signature
|
|
|
except Exception as e:
|
|
|
Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
|
|
|
continue
|
|
|
- if not video_dict:
|
|
|
- continue
|
|
|
- if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
|
|
|
- Common.logger(log_type, crawler).info(f'gid:{video_dict["video_id"]},视频已下载,无需重复下载\n')
|
|
|
- continue
|
|
|
- for k, v in video_dict.items():
|
|
|
- Common.logger(log_type, crawler).info(f"{k}:{v}")
|
|
|
- try:
|
|
|
-
|
|
|
- cls.download_publish(
|
|
|
- search_word=search_word,
|
|
|
- log_type=log_type,
|
|
|
- crawler=crawler,
|
|
|
- video_dict=video_dict,
|
|
|
- rule_dict=rule_dict,
|
|
|
- strategy=strategy,
|
|
|
- our_uid=our_uid,
|
|
|
- oss_endpoint=oss_endpoint,
|
|
|
- env=env,
|
|
|
- machine=machine
|
|
|
- )
|
|
|
- except Exception as e:
|
|
|
- Common.logger(log_type, crawler).error(f'视频:{item_id},download_publish异常:{e}\n')
|
|
|
- continue
|
|
|
+ if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
|
|
|
+ Common.logger(log_type, crawler).info(
|
|
|
+ f'gid:{video_dict["gid"]},视频已下载,无需重复下载\n')
|
|
|
+ continue
|
|
|
+ if not cls.is_ruled(log_type, crawler, video_dict, rule_dict):
|
|
|
+ Common.logger(log_type, crawler).error(f'视频:{item_id},不符合抓取规则\n')
|
|
|
+ continue
|
|
|
+ for k, v in video_dict.items():
|
|
|
+ Common.logger(log_type, crawler).info(f"{k}:{v}")
|
|
|
+ try:
|
|
|
+ # print(
|
|
|
+ # f'search_word:{search_word},title:{video_dict["video_title"]},gid:{video_dict["gid"]},offset:{offset}, total:{total_count}')
|
|
|
+ cls.download_publish(
|
|
|
+ search_word=search_word,
|
|
|
+ log_type=log_type,
|
|
|
+ crawler=crawler,
|
|
|
+ video_dict=video_dict,
|
|
|
+ rule_dict=rule_dict,
|
|
|
+ strategy=strategy,
|
|
|
+ our_uid=our_uid,
|
|
|
+ oss_endpoint=oss_endpoint,
|
|
|
+ env=env,
|
|
|
+ machine=machine
|
|
|
+ )
|
|
|
+ total_count += 1
|
|
|
+ if total_count >= 30:
|
|
|
+ return
|
|
|
+ else:
|
|
|
+ break
|
|
|
+ except Exception as e:
|
|
|
+ Common.logger(log_type, crawler).error(f'视频:{item_id},download_publish异常:{e}\n')
|
|
|
|
|
|
- total_count += 1
|
|
|
- # print(f'search_word:{search_word},title:{video_dict["video_title"]},gid:{video_dict["gid"]},offset:{offset}, total:{total_count}')
|
|
|
- if total_count >= 30:
|
|
|
- return
|
|
|
offset += 10
|
|
|
|
|
|
@classmethod
|
|
@@ -887,7 +906,7 @@ class Search:
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
# print(Follow.get_signature("follow", "xigua", "95420624045", "local"))
|
|
|
- # Search.get_search_videos('search', 'xigua', 'xigua_search', 'inner', 'prod', 'aliyun')
|
|
|
+ Search.get_search_videos('search', 'xigua', 'xigua_search', 'inner', 'prod', 'aliyun')
|
|
|
|
|
|
# Follow.get_videolist(log_type="follow",
|
|
|
# crawler="xigua",
|