|
@@ -651,6 +651,7 @@ class Search:
|
|
|
res = requests.request("GET", url, headers=headers, proxies=Common.tunnel_proxies())
|
|
|
search_list = res.json()['data']['data']
|
|
|
except Exception as e:
|
|
|
+ Common.logger(log_type, crawler).error(f'关键词:{search_word},没有获取到视频列表:offset{offset}')
|
|
|
search_list = []
|
|
|
if not search_list:
|
|
|
return
|
|
@@ -702,7 +703,10 @@ class Search:
|
|
|
video_dict = cls.get_video_info(log_type, crawler, item_id)
|
|
|
if not video_dict:
|
|
|
continue
|
|
|
- if cls.is_ruled(log_type, crawler, video_dict, rule_dict):
|
|
|
+ if not cls.is_ruled(log_type, crawler, video_dict, rule_dict):
|
|
|
+ Common.logger(log_type, crawler).info(f'gid:{item_id},不符合抓取规则\n')
|
|
|
+ continue
|
|
|
+ else:
|
|
|
video_url_dict = cls.get_video_url(log_type, crawler, video_dict['gid'])
|
|
|
video_dict['video_width'] = video_url_dict["video_width"]
|
|
|
video_dict['video_height'] = video_url_dict["video_height"]
|
|
@@ -710,21 +714,20 @@ class Search:
|
|
|
video_dict['video_url'] = video_url_dict["video_url"]
|
|
|
video_dict['session'] = signature
|
|
|
break
|
|
|
- else:
|
|
|
- continue
|
|
|
except Exception as e:
|
|
|
Common.logger(log_type, crawler).error(f'视频:{item_id},没有获取到视频详情,原因:{e}')
|
|
|
continue
|
|
|
|
|
|
if not cls.is_ruled(log_type, crawler, video_dict, rule_dict):
|
|
|
+ Common.logger(log_type, crawler).info(f'gid:{item_id},不符合抓取规则\n')
|
|
|
+ continue
|
|
|
+ if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
|
|
|
+ Common.logger(log_type, crawler).info(f'gid:{item_id},视频已下载,无需重复下载\n')
|
|
|
continue
|
|
|
for k, v in video_dict.items():
|
|
|
Common.logger(log_type, crawler).info(f"{k}:{v}")
|
|
|
- # print(f'title:{video_dict["video_title"]},gid:{video_dict["gid"]},offset:{offset}, total:{total_count}')
|
|
|
try:
|
|
|
- if cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
|
|
|
- Common.logger(log_type, crawler).info(f'gid:{item_id},视频已下载,无需重复下载\n')
|
|
|
- continue
|
|
|
+
|
|
|
cls.download_publish(
|
|
|
search_word=search_word,
|
|
|
log_type=log_type,
|
|
@@ -738,8 +741,11 @@ class Search:
|
|
|
machine=machine
|
|
|
)
|
|
|
except Exception as e:
|
|
|
+ Common.logger(log_type, crawler).error(f'视频:{item_id},download_publish异常:{e}\n')
|
|
|
continue
|
|
|
+
|
|
|
total_count += 1
|
|
|
+ # print(f'search_word:{search_word},title:{video_dict["video_title"]},gid:{video_dict["gid"]},offset:{offset}, total:{total_count}')
|
|
|
if total_count >= 30:
|
|
|
return
|
|
|
offset += 10
|
|
@@ -754,112 +760,109 @@ class Search:
|
|
|
@classmethod
def download_publish(cls, log_type, crawler, search_word, strategy, video_dict, rule_dict, our_uid, oss_endpoint,
                     env, machine):
    """Download one video, publish it, then record it in Feishu and MySQL.

    Steps, in order: download the video and audio streams, merge them,
    validate the merged file, download the cover, save the video info,
    upload/publish, insert a row into the Feishu sheet and insert a row
    into the ``crawler_video`` table.

    Args:
        log_type: Passed to ``Common.logger`` and the other helpers.
        crawler: Crawler name; also the root of the local
            ``./{crawler}/videos/`` working directory.
        search_word: Keyword written into the first column of the Feishu row.
        strategy: Passed through to ``Publish.upload_and_publish``.
        video_dict: Video metadata. Keys read here: ``video_title``,
            ``video_url``, ``audio_url``, ``cover_url``, ``video_id``,
            ``gid``, ``play_cnt``, ``comment_cnt``, ``like_cnt``,
            ``share_cnt``, ``duration``, ``video_width``, ``video_height``,
            ``publish_time_str``, ``user_name``, ``user_id``, ``avatar_url``.
        rule_dict: Crawl rule; JSON-serialised into the ``crawler_rule`` column.
        our_uid: Passed to the publisher and stored in the ``user_id`` column.
        oss_endpoint: Passed through to ``Publish.upload_and_publish``.
        env: ``'dev'`` selects the test admin link; any other value selects
            the production link. Also passed to the publisher and MySQL helper.
        machine: Passed through to ``MysqlHelper.update_values``.

    Returns:
        None. The method returns early (after deleting the local video
        folder) when the merged file is invalid or the upload yields no id.

    Raises:
        Nothing is caught here; any exception raised by the helpers
        propagates to the caller.
    """
    # Local working folder for this video; every step below reads or writes it.
    video_dir = f"./{crawler}/videos/{video_dict['video_title']}"

    # Download the video stream
    Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video',
                           title=video_dict['video_title'], url=video_dict['video_url'])
    # Download the audio stream
    Common.download_method(log_type=log_type, crawler=crawler, text='xigua_audio',
                           title=video_dict['video_title'], url=video_dict['audio_url'])
    # Merge audio and video
    Common.video_compose(log_type=log_type, crawler=crawler, video_dir=video_dir)

    ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"{video_dir}/video.mp4")
    if ffmpeg_dict is None or ffmpeg_dict['size'] == 0:
        Common.logger(log_type, crawler).warning("下载的视频无效,已删除\n")
        # Delete the video folder
        shutil.rmtree(video_dir)
        return

    # Download the cover image
    Common.download_method(log_type=log_type, crawler=crawler, text='cover',
                           title=video_dict['video_title'], url=video_dict['cover_url'])
    # Save the video info to a txt file
    Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)

    # Upload the video
    Common.logger(log_type, crawler).info("开始上传视频...")
    our_video_id = Publish.upload_and_publish(log_type=log_type,
                                              crawler=crawler,
                                              strategy=strategy,
                                              our_uid=our_uid,
                                              env=env,
                                              oss_endpoint=oss_endpoint)
    # Check for a failed upload BEFORE logging success or building the link,
    # so a missing id is never reported as a completed upload.
    if our_video_id is None:
        # Delete the video folder
        shutil.rmtree(video_dir)
        return

    if env == 'dev':
        our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
    else:
        our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
    Common.logger(log_type, crawler).info("视频上传完成")

    # Write the video to the Feishu sheet: insert a row, then fill E2:Z2.
    Feishu.insert_columns(log_type, 'xigua', "BUNvGC", "ROWS", 1, 2)
    upload_time = int(time.time())
    values = [[
        search_word,
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
        "关键词搜索",
        video_dict['video_title'],
        str(video_dict['video_id']),
        our_video_link,
        video_dict['gid'],
        video_dict['play_cnt'],
        video_dict['comment_cnt'],
        video_dict['like_cnt'],
        video_dict['share_cnt'],
        video_dict['duration'],
        str(video_dict['video_width']) + '*' + str(video_dict['video_height']),
        video_dict['publish_time_str'],
        video_dict['user_name'],
        video_dict['user_id'],
        video_dict['avatar_url'],
        video_dict['cover_url'],
        video_dict['video_url'],
        video_dict['audio_url']]]
    # NOTE(review): presumably a pause so the inserted row exists before it is
    # filled — confirm against the Feishu helper.
    time.sleep(1)
    Feishu.update_values(log_type, 'xigua', "BUNvGC", "E2:Z2", values)
    Common.logger(log_type, crawler).info("视频已保存至云文档\n")

    # Save the video info to the database.
    # NOTE(review): the statement is built by string interpolation, so a title,
    # URL or rule value containing a quote character will break it (and is an
    # injection risk for crawled text). Switch to bound parameters if
    # MysqlHelper supports them — its signature is not visible here.
    # NOTE(review): the strategy column is hard-coded rather than taken from
    # the `strategy` argument — confirm this is intended.
    insert_sql = f""" insert into crawler_video(video_id,
                    user_id,
                    out_user_id,
                    platform,
                    strategy,
                    out_video_id,
                    video_title,
                    cover_url,
                    video_url,
                    duration,
                    publish_time,
                    play_cnt,
                    crawler_rule,
                    width,
                    height)
                    values({our_video_id},
                    {our_uid},
                    "{video_dict['user_id']}",
                    "{cls.platform}",
                    "定向爬虫策略",
                    "{video_dict['video_id']}",
                    "{video_dict['video_title']}",
                    "{video_dict['cover_url']}",
                    "{video_dict['video_url']}",
                    {int(video_dict['duration'])},
                    "{video_dict['publish_time_str']}",
                    {int(video_dict['play_cnt'])},
                    '{json.dumps(rule_dict)}',
                    {int(video_dict['video_width'])},
                    {int(video_dict['video_height'])}) """
    Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
    MysqlHelper.update_values(log_type, crawler, insert_sql, env, machine)
    Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
|
|
|
|
|
|
@classmethod
|
|
|
def get_search_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
|