Jelajahi Sumber

opt asr task

liqian 1 tahun lalu
induk
komit
53d4f16839
3 file diubah dengan 10 penambahan dan 4 penghapusan
  1. 1 0
      .gitignore
  2. 5 2
      asr_task.py
  3. 4 2
      download_videos_task.py

+ 1 - 0
.gitignore

@@ -63,4 +63,5 @@ logs/
 .idea/
 .DS_Store
 asr_res/
+videos_*/
 

+ 5 - 2
asr_task.py

@@ -2,6 +2,7 @@ import os
 import shutil
 import json
 import datetime
+import sys
 import time
 import traceback
 import requests
@@ -35,7 +36,7 @@ def get_asr(video_id, download_folder, asr_folder):
             break
 
 
-def asr_process(project, table, dt):
+def asr_process(project, table, dt, cuda_id):
     # 获取特征数据
     feature_df = get_feature_data(project=project, table=table, dt=dt, features=features)
     video_id_list = feature_df['videoid'].to_list()
@@ -49,6 +50,7 @@ def asr_process(project, table, dt):
             video_info[video_id] = {'title': title}
     # 获取已下载视频,做asr识别
     download_folder = 'videos'
+    download_folder = f'{download_folder}_{cuda_id}'
     asr_folder = 'asr_res'
     if not os.path.exists(asr_folder):
         os.makedirs(asr_folder)
@@ -105,6 +107,7 @@ def asr_process(project, table, dt):
 
 def timer_check():
     try:
+        cuda_id = sys.argv[1]
         project = config_.DAILY_VIDEO['project']
         table = config_.DAILY_VIDEO['table']
         now_date = datetime.datetime.today()
@@ -115,7 +118,7 @@ def timer_check():
         if data_count > 0:
             print(f'videos count = {data_count}')
             # 数据准备好,进行视频下载
-            asr_process(project=project, table=table, dt=dt)
+            asr_process(project=project, table=table, dt=dt, cuda_id=cuda_id)
             print(f"videos asr finished!")
 
         else:

+ 4 - 2
download_videos_task.py

@@ -15,7 +15,9 @@ features = ['videoid', 'title', 'video_path']
 def download_video_from_oss(video_id, video_path, download_folder):
     """从oss下载视频"""
     try:
+        pid = int(os.getpid() % 2)
         print(f"{video_id} download start ...")
+        download_folder = f"{download_folder}_{pid}"
         if not os.path.exists(download_folder):
             os.makedirs(download_folder)
         video_local_dir = os.path.join(download_folder, video_id)
@@ -56,8 +58,8 @@ def download_videos(project, table, dt):
     feature_df = get_feature_data(project=project, table=table, dt=dt, features=features)
     download_folder = 'videos'
     video_id_list = feature_df['videoid'].to_list()
-    pool = multiprocessing.Pool(processes=5)
-    for video_id in video_id_list:
+    pool = multiprocessing.Pool(processes=6)
+    for video_id in video_id_list[:10]:
         video_path = feature_df[feature_df['videoid'] == video_id]['video_path'].values[0].strip()
         video_path = video_path.replace(' ', '')
         print(video_id, video_path)