|
@@ -38,11 +38,13 @@ def get_asr(video_id, download_folder, asr_folder):
|
|
|
|
|
|
def asr_process(project, table, dt, cuda_id):
|
|
|
# 获取特征数据
|
|
|
- feature_df = get_feature_data(project=project, table=table, dt=dt, features=features)
|
|
|
+ feature_df = get_feature_data(
|
|
|
+ project=project, table=table, dt=dt, features=features)
|
|
|
video_id_list = feature_df['videoid'].to_list()
|
|
|
video_info = {}
|
|
|
for video_id in video_id_list:
|
|
|
- title = feature_df[feature_df['videoid'] == video_id]['title'].values[0]
|
|
|
+ title = feature_df[feature_df['videoid']
|
|
|
+ == video_id]['title'].values[0]
|
|
|
if title is None:
|
|
|
continue
|
|
|
title = title.strip()
|
|
@@ -112,7 +114,8 @@ def timer_check():
|
|
|
table = config_.DAILY_VIDEO['table']
|
|
|
now_date = datetime.datetime.today()
|
|
|
print(f"now_date: {datetime.datetime.strftime(now_date, '%Y%m%d')}")
|
|
|
- dt = datetime.datetime.strftime(now_date-datetime.timedelta(days=1), '%Y%m%d')
|
|
|
+ dt = datetime.datetime.strftime(
|
|
|
+ now_date-datetime.timedelta(days=1), '%Y%m%d')
|
|
|
# 查看数据是否已准备好
|
|
|
data_count = data_check(project=project, table=table, dt=dt)
|
|
|
if data_count > 0:
|
|
@@ -124,15 +127,43 @@ def timer_check():
|
|
|
Timer(60, timer_check).start()
|
|
|
else:
|
|
|
# 数据准备好,进行asr
|
|
|
- asr_process(project=project, table=table, dt=dt, cuda_id=cuda_id)
|
|
|
+ asr_process(project=project, table=table,
|
|
|
+ dt=dt, cuda_id=cuda_id)
|
|
|
print(f"videos asr finished!")
|
|
|
|
|
|
else:
|
|
|
# 数据没准备好,1分钟后重新检查
|
|
|
Timer(60, timer_check).start()
|
|
|
except Exception as e:
|
|
|
- print(f"视频asr识别失败, exception: {e}, traceback: {traceback.format_exc()}")
|
|
|
+ print(
|
|
|
+ f"视频asr识别失败, exception: {e}, traceback: {traceback.format_exc()}")
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: run ASR over every already-downloaded video that
    # belongs to one worker slot.
    #
    # Usage: python <script> <cuda_id>
    #
    # NOTE: the timer-driven entry point is intentionally disabled here; the
    # script processes whatever is already present in the download folder.
    # timer_check()

    # Fail with a usage hint instead of an IndexError traceback when the
    # slot id is missing.
    if len(sys.argv) < 2:
        print(f"usage: {sys.argv[0]} <cuda_id>")
        sys.exit(1)
    cuda_id = sys.argv[1]

    # Each slot reads from its own download folder, e.g. "videos_0".
    download_folder = f'videos_{cuda_id}'
    if not os.path.exists(download_folder):
        print(f"download_folder: {download_folder} not exists!")
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist (e.g. under `python -S`).
        sys.exit(0)

    # Every entry name under download_folder is passed on as a video_id.
    video_folder_list = os.listdir(download_folder)
    if not video_folder_list:
        print("video_folder_list is empty!")
        sys.exit(0)

    # exist_ok avoids the check-then-create race when several instances of
    # this script (one per cuda_id) start at the same time.
    asr_folder = 'asr_res'
    os.makedirs(asr_folder, exist_ok=True)

    # The context manager guarantees the pool is torn down even if task
    # submission raises; close() + join() still wait for normal completion.
    with multiprocessing.Pool(processes=5) as pool:
        pending = [
            (
                video_id,
                pool.apply_async(
                    func=get_asr,
                    args=(video_id, download_folder, asr_folder),
                ),
            )
            for video_id in video_folder_list
        ]
        pool.close()
        pool.join()

    # apply_async swallows worker exceptions unless the result is fetched;
    # surface them so failed videos are visible in the log.
    for video_id, result in pending:
        try:
            result.get()
        except Exception as e:
            print(f"asr failed, video_id: {video_id}, exception: {e}")

    print("videos asr finished!")
|