Ver Fonte

add asr task

liqian há 1 ano atrás
pai
commit
f955751c60
1 ficheiros alterados com 89 adições e 0 exclusões
  1. 89 0
      asr_task.py

+ 89 - 0
asr_task.py

@@ -0,0 +1,89 @@
+import os
+import shutil
+import json
+import datetime
+import time
+import traceback
+import requests
+import multiprocessing
+from threading import Timer
+from utils import data_check, get_feature_data, asr_validity_discrimination
+from whisper_asr import get_whisper_asr
+from gpt_tag import request_gpt
+from config import set_config
+from log import Log
+# Project configuration returned by set_config(); timer_check() reads
+# config_.DAILY_VIDEO['project'] and config_.DAILY_VIDEO['table'] from it.
+config_ = set_config()
+# Logger instance; it is not referenced by the code visible in this file.
+log_ = Log()
+# Columns requested from the feature table via get_feature_data(). Only
+# 'videoid' and 'title' are read by asr_process(); 'video_path' is fetched
+# but not used in the visible code.
+features = ['videoid', 'title', 'video_path']
+
+
+def asr_process(project, table, dt):
+    # 获取特征数据
+    feature_df = get_feature_data(project=project, table=table, dt=dt, features=features)
+    video_id_list = feature_df['videoid'].to_list()
+    video_info = {}
+    for video_id in video_id_list:
+        title = feature_df[feature_df['videoid'] == video_id]['title'].values[0]
+        if title is None:
+            continue
+        title = title.strip()
+        if len(title) > 0:
+            video_info[video_id] = {'title': title}
+    # 获取已下载视频,做asr识别
+    download_folder = 'videos'
+    asr_folder = 'asr_res'
+    if not os.path.exists(asr_folder):
+        os.makedirs(asr_folder)
+    retry = 0
+    while retry < 3:
+        video_folder_list = os.listdir(download_folder)
+        if len(video_folder_list) < 2:
+            retry += 1
+            time.sleep(60)
+            continue
+        for video_id in video_folder_list:
+            if video_id not in video_id_list:
+                continue
+            if video_info.get(video_id, None) is None:
+                shutil.rmtree(os.path.join(download_folder, video_id))
+            else:
+                video_folder = os.path.join(download_folder, video_id)
+                for filename in os.listdir(video_folder):
+                    video_type = filename.split('.')[-1]
+                    if video_type in ['mp4', 'm3u8']:
+                        video_file = os.path.join(video_folder, filename)
+                        # 1. asr识别
+                        asr_res_initial = get_whisper_asr(video=video_file)
+                        # 2. 识别结果写入文件
+                        asr_path = os.path.join(asr_folder, f"{video_id}.txt")
+                        with open(asr_path, 'w', encoding='utf-8') as wf:
+                            wf.write(asr_res_initial)
+                        # 将处理过的视频进行删除
+                        shutil.rmtree(os.path.join(download_folder, video_id))
+                    else:
+                        shutil.rmtree(os.path.join(download_folder, video_id))
+
+def timer_check():
+    try:
+        project = config_.DAILY_VIDEO['project']
+        table = config_.DAILY_VIDEO['table']
+        now_date = datetime.datetime.today()
+        print(f"now_date: {datetime.datetime.strftime(now_date, '%Y%m%d')}")
+        dt = datetime.datetime.strftime(now_date-datetime.timedelta(days=1), '%Y%m%d')
+        # 查看数据是否已准备好
+        data_count = data_check(project=project, table=table, dt=dt)
+        if data_count > 0:
+            print(f'videos count = {data_count}')
+            # 数据准备好,进行视频下载
+            asr_process(project=project, table=table, dt=dt)
+            print(f"videos asr finished!")
+
+        else:
+            # 数据没准备好,1分钟后重新检查
+            Timer(60, timer_check).start()
+    except Exception as e:
+        print(f"视频asr识别失败, exception: {e}, traceback: {traceback.format_exc()}")
+
+
+# Script entry point: start the readiness check, which re-schedules itself
+# every 60 seconds until data is available.
+if __name__ == '__main__':
+    timer_check()