@@ -676,31 +676,30 @@ class Follow:

     @classmethod
     def get_videos(cls, log_type, crawler, strategy, oss_endpoint, env, browse_id, out_uid, our_uid, machine):
-        # try:
-        while True:
-            feeds = cls.get_feeds(log_type, crawler, browse_id, out_uid)
-            # Common.logger(log_type, crawler).info(f"feeds:{feeds}\n")
-            for i in range(len(feeds)):
-                if 'richItemRenderer' not in feeds[i]:
-                    Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]}\n')
-                elif 'content' not in feeds[i]['richItemRenderer']:
-                    Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]["richItemRenderer"]}\n')
-                elif 'videoRenderer' not in feeds[i]['richItemRenderer']['content']:
-                    Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]["richItemRenderer"]["content"]}\n')
-                elif 'videoId' not in feeds[i]["richItemRenderer"]["content"]['videoRenderer']:
-                    Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]["richItemRenderer"]["content"]["videoRenderer"]}\n')
-                else:
-                    video_id = feeds[i]["richItemRenderer"]["content"]['videoRenderer']['videoId']
-                    video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
-                    # publish time <= 30 days
-                    publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
-                    if int(time.time()) - publish_time <= 3600*24*30:
-                        cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine)
+        try:
+            while True:
+                feeds = cls.get_feeds(log_type, crawler, browse_id, out_uid)
+                for i in range(len(feeds)):
+                    if 'richItemRenderer' not in feeds[i]:
+                        Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]}\n')
+                    elif 'content' not in feeds[i]['richItemRenderer']:
+                        Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]["richItemRenderer"]}\n')
+                    elif 'videoRenderer' not in feeds[i]['richItemRenderer']['content']:
+                        Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]["richItemRenderer"]["content"]}\n')
+                    elif 'videoId' not in feeds[i]["richItemRenderer"]["content"]['videoRenderer']:
+                        Common.logger(log_type, crawler).warning(f'feeds:{feeds[i]["richItemRenderer"]["content"]["videoRenderer"]}\n')
                     else:
-                        Common.logger(log_type, crawler).info('publish time is more than 30 days old\n')
-                        return
-        # except Exception as e:
-        #     Common.logger(log_type, crawler).error(f"get_videos exception: {e}\n")
+                        video_id = feeds[i]["richItemRenderer"]["content"]['videoRenderer']['videoId']
+                        video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
+                        # publish time <= 30 days; use the pre-parsed stamp so both date formats work
+                        publish_time_stamp = video_dict['publish_time_stamp']
+                        if int(time.time()) - publish_time_stamp <= 3600*24*30:
+                            cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine)
+                        else:
+                            Common.logger(log_type, crawler).info('publish time is more than 30 days old\n')
+                            return
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f"get_videos exception: {e}\n")

     @classmethod
     def get_video_info(cls, log_type, crawler, out_uid, video_id, machine):
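The four-level membership-check chain in get_videos is correct but verbose. As a hedged aside (the helper name is illustrative, not from this codebase), the same null-safe traversal can be written with chained `dict.get` calls, at the cost of losing the per-level warning logs:

```python
from typing import Optional

# Sketch only: collapse the nested 'not in' checks from get_videos into one
# null-safe lookup. extract_video_id is a hypothetical helper name.
def extract_video_id(feed_item: dict) -> Optional[str]:
    renderer = (feed_item.get('richItemRenderer', {})
                         .get('content', {})
                         .get('videoRenderer', {}))
    # Returns None when any level is missing, mirroring the elif chain.
    return renderer.get('videoId')

# extract_video_id({'richItemRenderer': {'content': {'videoRenderer': {'videoId': 'abc'}}}}) -> 'abc'
# extract_video_id({}) -> None
```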
@@ -881,7 +880,6 @@ class Follow:
             'x-youtube-client-version': '2.20230201.01.00'
         }
         response = requests.post(url=url, headers=headers, data=payload)
-        # Common.logger(log_type, crawler).info(f"get_video_info_response:{response.json()}\n")
         if response.status_code != 200:
             Common.logger(log_type, crawler).warning(f"get_video_info_response:{response.text}\n")
         elif 'streamingData' not in response.json():
@@ -920,6 +918,14 @@ class Follow:
             else:
                 publish_time = playerMicroformatRenderer['publishDate']

+            if publish_time == '':
+                publish_time_stamp = 0
+            elif ':' in publish_time:
+                publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")))
+            else:
+                publish_time_stamp = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
+
+
             # user_name
             if 'author' not in videoDetails:
                 user_name = ''
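The three-way branch above handles the two date formats this crawler sees in `publishDate`. A small helper, shown as a hedged sketch (the function name and placement are illustrative), would keep both format strings in one place:

```python
import time

# Illustrative helper (not in the codebase): parse YouTube's publishDate,
# which arrives either as "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS".
def parse_publish_time_stamp(publish_time: str) -> int:
    if publish_time == '':
        return 0  # sentinel for "unknown", as in the diff
    fmt = "%Y-%m-%d %H:%M:%S" if ':' in publish_time else "%Y-%m-%d"
    return int(time.mktime(time.strptime(publish_time, fmt)))

# parse_publish_time_stamp("2023-02-01")          -> seconds since epoch (local time)
# parse_publish_time_stamp("2023-02-01 08:30:00") -> same, with time of day
# parse_publish_time_stamp("")                    -> 0
```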
@@ -962,6 +968,7 @@ class Follow:
             'duration': duration,
             'play_cnt': play_cnt,
             'publish_time': publish_time,
+            'publish_time_stamp': publish_time_stamp,
             'user_name': user_name,
             'out_uid': out_uid,
             'cover_url': cover_url,
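With `publish_time_stamp` carried in `video_dict`, downstream callers can check freshness without re-parsing the date string. A minimal sketch of the 30-day window used in get_videos (the constant name is illustrative; the diff inlines it as `3600*24*30`):

```python
import time

THIRTY_DAYS_S = 3600 * 24 * 30  # illustrative constant name

# Sketch: freshness gate over the new publish_time_stamp key.
def is_within_30_days(video_dict: dict) -> bool:
    return int(time.time()) - video_dict['publish_time_stamp'] <= THIRTY_DAYS_S
```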
@@ -973,141 +980,143 @@ class Follow:

     @classmethod
     def download_publish(cls, log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine):
-        # try:
-        sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_dict['video_id']}" """
-        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
-        if video_dict['video_title'] == '' or video_dict['video_url'] == '':
-            Common.logger(log_type, crawler).info('invalid video\n')
-        elif video_dict['duration'] > 600 or video_dict['duration'] < 60:
-            Common.logger(log_type, crawler).info(f"duration: {video_dict['duration']} does not meet the rule\n")
-        elif repeat_video is not None and len(repeat_video) != 0:
-            Common.logger(log_type, crawler).info('video already downloaded\n')
-        else:
-            # download the video
-            Common.logger(log_type, crawler).info('start downloading video...')
-            Common.download_method(log_type, crawler, 'video', video_dict['video_title'], video_dict['video_url'])
-            ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/video.mp4")
-            video_width = int(ffmpeg_dict['width'])
-            video_height = int(ffmpeg_dict['height'])
-            duration = int(ffmpeg_dict['duration'])
-            video_size = int(ffmpeg_dict['size'])
-
-            Common.logger(log_type, crawler).info(f'video_width:{video_width}')
-            Common.logger(log_type, crawler).info(f'video_height:{video_height}')
-            Common.logger(log_type, crawler).info(f'duration:{duration}')
-            Common.logger(log_type, crawler).info(f'video_size:{video_size}\n')
-
-            video_dict['video_width'] = video_width
-            video_dict['video_height'] = video_height
-            video_dict['duration'] = duration
-            video_dict['comment_cnt'] = 0
-            video_dict['like_cnt'] = 0
-            video_dict['share_cnt'] = 0
-            video_dict['avatar_url'] = video_dict['cover_url']
-            video_dict['session'] = f'youtube{int(time.time())}'
-            rule = '1,2'
-            if duration < 60 or duration > 600:
-                # delete the video folder
-                shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
-                Common.logger(log_type, crawler).info(f"duration: {video_dict['duration']} does not meet the crawl rule, deleted\n")
-                return
-            elif video_size == 0 or duration == 0 or video_size is None or duration is None:
-                # delete the video folder
-                shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
-                Common.logger(log_type, crawler).info("video download failed, deleted\n")
-                return
+        try:
+            sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_dict['video_id']}" """
+            repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
+            if video_dict['video_title'] == '' or video_dict['video_url'] == '':
+                Common.logger(log_type, crawler).info('invalid video\n')
+            elif video_dict['duration'] > 600 or video_dict['duration'] < 60:
+                Common.logger(log_type, crawler).info(f"duration: {video_dict['duration']} does not meet the rule\n")
+            elif repeat_video is not None and len(repeat_video) != 0:
+                Common.logger(log_type, crawler).info('video already downloaded\n')
+            elif video_dict['video_id'] in [x for y in Feishu.get_values_batch(log_type, crawler, 'GVxlYk') for x in y]:
+                Common.logger(log_type, crawler).info('video already downloaded\n')
             else:
-                # download the cover image
-                Common.download_method(log_type, crawler, 'cover', video_dict['video_title'], video_dict['cover_url'])
-                # save video metadata to file
-                Common.save_video_info(log_type, crawler, video_dict)
+                # download the video
+                Common.logger(log_type, crawler).info('start downloading video...')
+                Common.download_method(log_type, crawler, 'video', video_dict['video_title'], video_dict['video_url'])
+                ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/video.mp4")
+                video_width = int(ffmpeg_dict['width'])
+                video_height = int(ffmpeg_dict['height'])
+                duration = int(ffmpeg_dict['duration'])
+                video_size = int(ffmpeg_dict['size'])
+
+                Common.logger(log_type, crawler).info(f'video_width:{video_width}')
+                Common.logger(log_type, crawler).info(f'video_height:{video_height}')
+                Common.logger(log_type, crawler).info(f'duration:{duration}')
+                Common.logger(log_type, crawler).info(f'video_size:{video_size}\n')

-                # upload the video
-                Common.logger(log_type, crawler).info("start uploading video")
-                if env == 'dev':
-                    our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
-                    our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
+                video_dict['video_width'] = video_width
+                video_dict['video_height'] = video_height
+                video_dict['duration'] = duration
+                video_dict['comment_cnt'] = 0
+                video_dict['like_cnt'] = 0
+                video_dict['share_cnt'] = 0
+                video_dict['avatar_url'] = video_dict['cover_url']
+                video_dict['session'] = f'youtube{int(time.time())}'
+                rule = '1,2'
+                if duration < 60 or duration > 600:
+                    # delete the video folder
+                    shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
+                    Common.logger(log_type, crawler).info(f"duration: {video_dict['duration']} does not meet the crawl rule, deleted\n")
+                    return
+                elif video_size == 0 or duration == 0 or video_size is None or duration is None:
+                    # delete the video folder
+                    shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
+                    Common.logger(log_type, crawler).info("video download failed, deleted\n")
+                    return
                 else:
-                    our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
-                    our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
-                Common.logger(log_type, crawler).info("video upload finished")
+                    # download the cover image
+                    Common.download_method(log_type, crawler, 'cover', video_dict['video_title'], video_dict['cover_url'])
+                    # save video metadata to file
+                    Common.save_video_info(log_type, crawler, video_dict)
+
+                    # upload the video
+                    Common.logger(log_type, crawler).info("start uploading video")
+                    if env == 'dev':
+                        our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
+                        our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
+                    else:
+                        our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
+                        our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
+                    Common.logger(log_type, crawler).info("video upload finished")

-                # save video info to Feishu
-                Feishu.insert_columns(log_type, crawler, "GVxlYk", "ROWS", 1, 2)
-                # write data into the first row of the video-ID sheet
-                upload_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
-                values = [[upload_time,
-                           "定向榜",
-                           video_dict['video_id'],
-                           video_dict['video_title'],
-                           our_video_link,
-                           video_dict['play_cnt'],
-                           video_dict['duration'],
-                           f'{video_width}*{video_height}',
-                           video_dict['publish_time'],
-                           video_dict['user_name'],
-                           video_dict['cover_url'],
-                           video_dict['video_url']
-                           ]]
-                time.sleep(1)
-                Feishu.update_values(log_type, crawler, "GVxlYk", "F2:Z2", values)
-                Common.logger(log_type, crawler).info('video info written to the targeted-downloaded sheet\n')
+                    # save video info to Feishu
+                    Feishu.insert_columns(log_type, crawler, "GVxlYk", "ROWS", 1, 2)
+                    # write data into the first row of the video-ID sheet
+                    upload_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
+                    values = [[upload_time,
+                               "定向榜",
+                               video_dict['video_id'],
+                               video_dict['video_title'],
+                               our_video_link,
+                               video_dict['play_cnt'],
+                               video_dict['duration'],
+                               f'{video_width}*{video_height}',
+                               video_dict['publish_time'],
+                               video_dict['user_name'],
+                               video_dict['cover_url'],
+                               video_dict['video_url']
+                               ]]
+                    time.sleep(1)
+                    Feishu.update_values(log_type, crawler, "GVxlYk", "F2:Z2", values)
+                    Common.logger(log_type, crawler).info('video info written to the targeted-downloaded sheet\n')

-                # save video info to the database
-                sql = f""" insert into crawler_video(video_id,
-                                                     user_id,
-                                                     out_user_id,
-                                                     platform,
-                                                     strategy,
-                                                     out_video_id,
-                                                     video_title,
-                                                     cover_url,
-                                                     video_url,
-                                                     duration,
-                                                     publish_time,
-                                                     play_cnt,
-                                                     crawler_rule,
-                                                     width,
-                                                     height)
-                values({our_video_id},
-                       "{our_uid}",
-                       "{video_dict['out_uid']}",
-                       "{cls.platform}",
-                       "定向爬虫策略",
-                       "{video_dict['video_id']}",
-                       "{video_dict['video_title']}",
-                       "{video_dict['cover_url']}",
-                       "{video_dict['video_url']}",
-                       {int(duration)},
-                       "{video_dict['publish_time']}",
-                       {int(video_dict['play_cnt'])},
-                       "{rule}",
-                       {int(video_width)},
-                       {int(video_height)}) """
-                MysqlHelper.update_values(log_type, crawler, sql, env, machine)
-                Common.logger(log_type, crawler).info('video info inserted into the database\n')
-        # except Exception as e:
-        #     Common.logger(log_type, crawler).info(f"download_publish exception: {e}\n")
+                    # save video info to the database
+                    sql = f""" insert into crawler_video(video_id,
+                                                         user_id,
+                                                         out_user_id,
+                                                         platform,
+                                                         strategy,
+                                                         out_video_id,
+                                                         video_title,
+                                                         cover_url,
+                                                         video_url,
+                                                         duration,
+                                                         publish_time,
+                                                         play_cnt,
+                                                         crawler_rule,
+                                                         width,
+                                                         height)
+                    values({our_video_id},
+                           "{our_uid}",
+                           "{video_dict['out_uid']}",
+                           "{cls.platform}",
+                           "定向爬虫策略",
+                           "{video_dict['video_id']}",
+                           "{video_dict['video_title']}",
+                           "{video_dict['cover_url']}",
+                           "{video_dict['video_url']}",
+                           {int(duration)},
+                           "{video_dict['publish_time']}",
+                           {int(video_dict['play_cnt'])},
+                           "{rule}",
+                           {int(video_width)},
+                           {int(video_height)}) """
+                    MysqlHelper.update_values(log_type, crawler, sql, env, machine)
+                    Common.logger(log_type, crawler).info('video info inserted into the database\n')
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f"download_publish exception: {e}\n")
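A note on the insert above: values are interpolated straight into the SQL string, so a title containing a double quote would break the statement. Below is a hedged sketch of the same insert using DB-API placeholders; it assumes a pymysql-style connection object, and since the MysqlHelper wrapper shown here takes a fully rendered string, this is not a drop-in replacement:

```python
# Illustrative sketch only: parameterized version of the insert above.
# The driver escapes each value, so quotes in titles cannot break the SQL.
def insert_crawler_video(conn, our_video_id, our_uid, video_dict,
                         rule, video_width, video_height, platform, duration):
    sql = """insert into crawler_video(video_id, user_id, out_user_id, platform,
                                       strategy, out_video_id, video_title, cover_url,
                                       video_url, duration, publish_time, play_cnt,
                                       crawler_rule, width, height)
             values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
    with conn.cursor() as cursor:  # pymysql-style connection assumed
        cursor.execute(sql, (our_video_id, our_uid, video_dict['out_uid'], platform,
                             "定向爬虫策略", video_dict['video_id'], video_dict['video_title'],
                             video_dict['cover_url'], video_dict['video_url'], int(duration),
                             video_dict['publish_time'], int(video_dict['play_cnt']),
                             rule, int(video_width), int(video_height)))
    conn.commit()
```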

     @classmethod
     def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
-        # try:
-        user_list = cls.get_user_from_feishu(log_type, crawler, 'c467d7', env, machine)
-        if len(user_list) == 0:
-            Common.logger(log_type, crawler).warning('user list is empty\n')
-        else:
-            for user_dict in user_list:
-                out_uid = user_dict['out_user_id']
-                user_name = user_dict['out_user_name']
-                browse_id = user_dict['out_browse_id']
-                our_uid = user_dict['our_user_id']
-                Common.logger(log_type, crawler).info(f"fetching {user_name}'s homepage videos\n")
-                cls.get_videos(log_type, crawler, strategy, oss_endpoint, env, browse_id, out_uid, our_uid, machine)
-                Common.logger(log_type, crawler).info('sleeping for 10 seconds')
-                time.sleep(10)
-                cls.continuation = ''
-        # except Exception as e:
-        #     Common.logger(log_type, crawler).error(f"get_follow_videos exception: {e}\n")
+        try:
+            user_list = cls.get_user_from_feishu(log_type, crawler, 'c467d7', env, machine)
+            if len(user_list) == 0:
+                Common.logger(log_type, crawler).warning('user list is empty\n')
+            else:
+                for user_dict in user_list:
+                    out_uid = user_dict['out_user_id']
+                    user_name = user_dict['out_user_name']
+                    browse_id = user_dict['out_browse_id']
+                    our_uid = user_dict['our_user_id']
+                    Common.logger(log_type, crawler).info(f"fetching {user_name}'s homepage videos\n")
+                    cls.get_videos(log_type, crawler, strategy, oss_endpoint, env, browse_id, out_uid, our_uid, machine)
+                    Common.logger(log_type, crawler).info('sleeping for 10 seconds')
+                    time.sleep(10)
+                    cls.continuation = ''
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f"get_follow_videos exception: {e}\n")


 if __name__ == "__main__":
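One behavioral note on the get_follow_videos change: the new try wraps the whole user loop, so a single failing user aborts the crawl for every user after it. A hedged sketch of per-user isolation, reusing the names from the diff (the restructured loop body is illustrative, not the committed code):

```python
# Sketch: move exception handling inside the loop so one bad user
# does not abort the remaining users.
for user_dict in user_list:
    try:
        out_uid = user_dict['out_user_id']
        user_name = user_dict['out_user_name']
        browse_id = user_dict['out_browse_id']
        our_uid = user_dict['our_user_id']
        Common.logger(log_type, crawler).info(f"fetching {user_name}'s homepage videos\n")
        cls.get_videos(log_type, crawler, strategy, oss_endpoint, env, browse_id, out_uid, our_uid, machine)
    except Exception as e:
        Common.logger(log_type, crawler).error(f"get_follow_videos exception: {e}\n")
    finally:
        # throttle between users and reset pagination either way
        Common.logger(log_type, crawler).info('sleeping for 10 seconds')
        time.sleep(10)
        cls.continuation = ''
```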