Parcourir la source

download_video

sunxy il y a 1 an
Parent
commit
ee6d6323a7
1 fichier modifié avec 37 ajouts et 6 suppressions
  1. 37 6
      asr_task.py

+ 37 - 6
asr_task.py

@@ -38,11 +38,13 @@ def get_asr(video_id, download_folder, asr_folder):
 
 def asr_process(project, table, dt, cuda_id):
     # 获取特征数据
-    feature_df = get_feature_data(project=project, table=table, dt=dt, features=features)
+    feature_df = get_feature_data(
+        project=project, table=table, dt=dt, features=features)
     video_id_list = feature_df['videoid'].to_list()
     video_info = {}
     for video_id in video_id_list:
-        title = feature_df[feature_df['videoid'] == video_id]['title'].values[0]
+        title = feature_df[feature_df['videoid']
+                           == video_id]['title'].values[0]
         if title is None:
             continue
         title = title.strip()
@@ -112,7 +114,8 @@ def timer_check():
         table = config_.DAILY_VIDEO['table']
         now_date = datetime.datetime.today()
         print(f"now_date: {datetime.datetime.strftime(now_date, '%Y%m%d')}")
-        dt = datetime.datetime.strftime(now_date-datetime.timedelta(days=1), '%Y%m%d')
+        dt = datetime.datetime.strftime(
+            now_date-datetime.timedelta(days=1), '%Y%m%d')
         # 查看数据是否已准备好
         data_count = data_check(project=project, table=table, dt=dt)
         if data_count > 0:
@@ -124,15 +127,43 @@ def timer_check():
                 Timer(60, timer_check).start()
             else:
                 # 数据准备好,进行asr
-                asr_process(project=project, table=table, dt=dt, cuda_id=cuda_id)
+                asr_process(project=project, table=table,
+                            dt=dt, cuda_id=cuda_id)
                 print(f"videos asr finished!")
 
         else:
             # 数据没准备好,1分钟后重新检查
             Timer(60, timer_check).start()
     except Exception as e:
-        print(f"视频asr识别失败, exception: {e}, traceback: {traceback.format_exc()}")
+        print(
+            f"视频asr识别失败, exception: {e}, traceback: {traceback.format_exc()}")
 
 
 if __name__ == '__main__':
-    timer_check()
+    # timer_check()
+    cuda_id = sys.argv[1]
+    download_folder = 'videos'
+    download_folder = f'{download_folder}_{cuda_id}'
+    if not os.path.exists(download_folder):
+        print(f"download_folder: {download_folder} not exists!")
+        exit(0)
+    # 遍历download_folder下所有的子文件夹名,即video_id list
+    video_folder_list = os.listdir(download_folder)
+    if len(video_folder_list) < 1:
+        print(f"video_folder_list is empty!")
+        exit(0)
+
+    asr_folder = 'asr_res'
+    if not os.path.exists(asr_folder):
+        os.makedirs(asr_folder)
+
+    pool = multiprocessing.Pool(processes=5)
+    for video_id in video_folder_list:
+        pool.apply_async(
+            func=get_asr,
+            args=(video_id, download_folder, asr_folder)
+        )
+    pool.close()
+    pool.join()
+
+    print(f"videos asr finished!")