sunxiaoyi pred 11 mesiacmi
rodič
commit
2c079ba427
5 zmenených súborov, 73 pridaní a 4 odobrania
  1. 3 0
      ODPSQueryUtil.py
  2. 40 1
      ai_tag_task.py
  3. 28 1
      asr_task.py
  4. 1 2
      asr_task.sh
  5. 1 0
      requirements.txt

+ 3 - 0
ODPSQueryUtil.py

@@ -22,3 +22,6 @@ def query_videos(start_idx, limit):
             # 处理查询结果
             result.append(record)
     return result
+
+
+

+ 40 - 1
ai_tag_task.py

@@ -13,7 +13,9 @@ from whisper_asr import get_whisper_asr
 from gpt_tag import request_gpt
 from config import set_config
 from log import Log
+from ReadXlsxFile import getVideoInfoInXlxs
 import mysql_connect
+
 config_ = set_config()
 log_ = Log()
 features = ['videoid', 'title', 'video_path']
@@ -267,4 +269,41 @@ def timer_check():
 
 
 if __name__ == '__main__':
-    timer_check()
+    # timer_check()
+    feature_df = getVideoInfoInXlxs('past_videos.xlsx')
+    video_id_list = feature_df['videoid'].to_list()
+    video_info = {}
+    for video_id in video_id_list:
+        titleObj = feature_df[feature_df['videoid']
+                           == video_id]['title'].values[0]
+        video_path = feature_df[feature_df['videoid']
+                                == video_id]['video_path'].values[0]
+        title = str(titleObj)
+        if title is None:
+            continue
+        title = title.strip()
+        if len(title) > 0:
+            video_info[video_id] = {'title': title, 'video_path': video_path}
+            # print(video_id, title)
+    print(len(video_info))
+    # 获取已asr识别的视频
+    asr_folder = 'asr_res'
+    retry = 0
+    while retry < 30:
+        asr_file_list = os.listdir(asr_folder)
+        if len(asr_file_list) < 1:
+            retry += 1
+            time.sleep(60)
+            continue
+        retry = 0
+        for asr_filename in asr_file_list:
+            video_id = asr_filename[:-4]
+            if video_id not in video_id_list:
+                continue
+            asr_file = os.path.join(asr_folder, asr_filename)
+            if video_info.get(video_id, None) is None:
+                os.remove(asr_file)
+            else:
+                get_video_ai_tags(
+                    video_id=video_id, asr_file=asr_file, video_info=video_info.get(video_id))
+                os.remove(asr_file)

+ 28 - 1
asr_task.py

@@ -142,4 +142,31 @@ def timer_check():
 
 
 if __name__ == '__main__':
-    timer_check()
+   #  timer_check()
+    cuda_id = sys.argv[1]
+    download_folder = 'videos'
+    download_folder = f'{download_folder}_{cuda_id}'
+    if not os.path.exists(download_folder):
+        print(f"download_folder: {download_folder} not exists!")
+        exit(0)
+    # 遍历download_folder下所有的子文件夹名,即video_id list
+    video_folder_list = os.listdir(download_folder)
+    if len(video_folder_list) < 1:
+        print(f"video_folder_list is empty!")
+        exit(0)
+
+    asr_folder = 'asr_res'
+    if not os.path.exists(asr_folder):
+        os.makedirs(asr_folder)
+
+    pool = multiprocessing.Pool(processes=2)
+    for video_id in video_folder_list:
+        pool.apply_async(
+            func=get_asr,
+            args=(video_id, download_folder, asr_folder)
+        )
+    pool.close()
+    pool.join()
+
+    print(f"videos asr finished!")
+

+ 1 - 2
asr_task.sh

@@ -2,10 +2,9 @@ ps -ef | grep asr_task.py | grep -v grep | awk '{print $2}' | xargs kill -9
 
 rm -r asr_res/
 
-source activate srt
+# source activate srt
 
 nohup env CUDA_VISIBLE_DEVICES=0 python asr_task.py 0 > logs/asr_task_0.log 2>&1 &
 
 nohup env CUDA_VISIBLE_DEVICES=1 python asr_task.py 1 > logs/asr_task_1.log 2>&1 &
 
-conda deactivate

+ 1 - 0
requirements.txt

@@ -9,3 +9,4 @@ aliyun-log-python-sdk
 odps
 whisper
 mysql-connector-python
+openpyxl