|
@@ -0,0 +1,89 @@
|
|
|
+import os
|
|
|
+import shutil
|
|
|
+import json
|
|
|
+import datetime
|
|
|
+import time
|
|
|
+import traceback
|
|
|
+import requests
|
|
|
+import multiprocessing
|
|
|
+from threading import Timer
|
|
|
+from utils import data_check, get_feature_data, asr_validity_discrimination
|
|
|
+from whisper_asr import get_whisper_asr
|
|
|
+from gpt_tag import request_gpt
|
|
|
+from config import set_config
|
|
|
+from log import Log
|
|
|
# Settings object returned by the project-local `config.set_config()`;
# read below through `config_.DAILY_VIDEO`.
config_ = set_config()

# Instance of the project-local `Log` class.
log_ = Log()

# Column names passed to `get_feature_data` when loading the video table.
features = [
    'videoid',
    'title',
    'video_path',
]
|
|
|
+
|
|
|
+
|
|
|
def _collect_video_info(video_ids, titles):
    """Map each video id to ``{'title': <stripped title>}``.

    Only ids whose title is a non-blank string are kept. When an id occurs
    more than once, the title of its first row decides, so a later duplicate
    never overrides an earlier one.
    """
    video_info = {}
    seen_ids = set()
    for video_id, title in zip(video_ids, titles):
        if video_id in seen_ids:
            continue
        seen_ids.add(video_id)
        # A missing title may not be a string at all (e.g. None, or possibly
        # NaN depending on what get_feature_data returns); such values have
        # no usable text and no .strip().
        if not isinstance(title, str):
            continue
        title = title.strip()
        if title:
            video_info[video_id] = {'title': title}
    return video_info


def _transcribe_video_folder(video_folder, asr_path):
    """Run ASR on the video inside *video_folder*, then delete the folder.

    The first file whose extension is ``mp4`` or ``m3u8`` is transcribed and
    the text is written to *asr_path*. A folder that holds files but no video
    is deleted without transcription. An empty folder is left untouched.

    Returns True when the folder was removed, False when it was empty.
    """
    filenames = os.listdir(video_folder)
    if not filenames:
        return False

    video_name = next(
        (name for name in filenames if name.split('.')[-1] in ('mp4', 'm3u8')),
        None,
    )
    if video_name is not None:
        # 1. Speech recognition on the downloaded video.
        asr_text = get_whisper_asr(video=os.path.join(video_folder, video_name))
        # 2. Persist the recognition result.
        with open(asr_path, 'w', encoding='utf-8') as wf:
            wf.write(asr_text)

    # Remove the folder exactly once, after every file has been inspected, so
    # extra files can neither trigger a second rmtree on a missing path nor
    # delete the video before it has been transcribed.
    shutil.rmtree(video_folder)
    return True


def asr_process(project, table, dt):
    """Transcribe downloaded videos listed in the feature table.

    Loads the ``features`` columns for the given project/table/partition via
    ``get_feature_data``, then polls the ``videos`` folder. Each sub-folder
    named after a known video id is handled as follows:

    * the video has no usable title -> the folder is deleted;
    * otherwise its mp4/m3u8 file is passed to ``get_whisper_asr``, the text
      is written to ``asr_res/<video_id>.txt`` and the folder is deleted.

    A polling pass that finds fewer than two entries, or that consumes
    nothing, counts as idle: the function sleeps 60 seconds and returns after
    three idle passes in total.

    Args:
        project: forwarded to ``get_feature_data``.
        table: forwarded to ``get_feature_data``.
        dt: partition value forwarded to ``get_feature_data``.

    Raises:
        OSError: propagated from the folder/file operations.
        Exception: whatever ``get_feature_data`` or ``get_whisper_asr`` raise.
    """
    # Load the feature data.
    feature_df = get_feature_data(project=project, table=table, dt=dt, features=features)
    video_id_list = feature_df['videoid'].to_list()
    video_info = _collect_video_info(video_id_list, feature_df['title'].to_list())
    # NOTE(review): folder names are strings; this only matches if the
    # 'videoid' column holds strings as well - confirm against the table.
    known_ids = set(video_id_list)

    # Pick up downloaded videos and run ASR on them.
    download_folder = 'videos'
    asr_folder = 'asr_res'
    os.makedirs(asr_folder, exist_ok=True)

    max_idle_passes = 3
    poll_interval = 60  # seconds
    min_pending = 2  # entries required before a pass processes anything

    idle_passes = 0
    while idle_passes < max_idle_passes:
        pending = os.listdir(download_folder)
        progressed = False
        if len(pending) >= min_pending:
            for video_id in pending:
                if video_id not in known_ids:
                    continue
                video_folder = os.path.join(download_folder, video_id)
                if video_id not in video_info:
                    # No usable title: discard the download.
                    shutil.rmtree(video_folder)
                    progressed = True
                    continue
                asr_path = os.path.join(asr_folder, f"{video_id}.txt")
                if _transcribe_video_folder(video_folder, asr_path):
                    progressed = True
        if not progressed:
            # Nothing was consumed: wait instead of re-listing in a tight
            # loop, and count the pass so that leftover entries that are
            # never consumed cannot keep the loop alive forever.
            idle_passes += 1
            time.sleep(poll_interval)
|
|
|
+
|
|
|
def timer_check():
    """Run ASR for yesterday's partition once its data is available.

    Reads the project and table names from ``config_.DAILY_VIDEO`` and asks
    ``data_check`` for the row count of yesterday's partition (formatted as
    ``%Y%m%d``). A positive count triggers ``asr_process``; otherwise the
    function re-schedules itself with ``threading.Timer`` to run again in
    60 seconds. Any exception is caught and printed with its traceback.
    """
    date_format = '%Y%m%d'
    try:
        project = config_.DAILY_VIDEO['project']
        table = config_.DAILY_VIDEO['table']

        today = datetime.datetime.today()
        print(f"now_date: {today.strftime(date_format)}")
        yesterday = today - datetime.timedelta(days=1)
        dt = yesterday.strftime(date_format)

        # Check whether the partition has been populated yet.
        data_count = data_check(project=project, table=table, dt=dt)
        if not data_count > 0:
            # Data is not ready: check again in one minute.
            Timer(60, timer_check).start()
            return

        print(f'videos count = {data_count}')
        # Data is ready: run the ASR pipeline.
        asr_process(project=project, table=table, dt=dt)
        print("videos asr finished!")
    except Exception as e:
        print(f"视频asr识别失败, exception: {e}, traceback: {traceback.format_exc()}")
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Script entry point: run the first readiness check right away;
    # timer_check re-schedules itself while the data is not available.
    timer_check()
|