"""Daily video ASR job.

Waits until yesterday's partition of the configured feature table has rows,
then transcribes the videos found under the local ``videos`` folder with
Whisper and writes one text file per video into ``asr_res``.
"""
import os
import shutil
import json
import datetime
import time
import traceback
import multiprocessing
from threading import Timer

import requests

from utils import data_check, get_feature_data, asr_validity_discrimination
from whisper_asr import get_whisper_asr
from gpt_tag import request_gpt
from config import set_config
from log import Log

config_ = set_config()
log_ = Log()
features = ['videoid', 'title', 'video_path']

# File extensions (text after the last '.') that are handed to Whisper.
_VIDEO_EXTENSIONS = ('mp4', 'm3u8')
# Seconds to wait when the download folder holds fewer than two entries.
_POLL_INTERVAL_SECONDS = 60
# Number of "fewer than two entries" polls after which asr_process returns.
_MAX_EMPTY_POLLS = 3
# Seconds to wait after a pass over the download folder that handled nothing,
# so the polling loop does not spin at full speed.
_IDLE_PASS_WAIT_SECONDS = 5


def _collect_video_info(video_ids, titles):
    """Map each video id to ``{'title': <stripped title>}``.

    Ids whose title is missing or blank are left out.  When an id occurs more
    than once, the title of its first row is the one that counts.

    Args:
        video_ids: ids in table row order.
        titles: titles in the same row order as ``video_ids``.

    Returns:
        dict keyed by video id.
    """
    first_title = {}
    for video_id, title in zip(video_ids, titles):
        first_title.setdefault(video_id, title)

    video_info = {}
    for video_id, title in first_title.items():
        # None and any other non-string value (e.g. a NaN placeholder for a
        # missing cell) are treated as "no title" instead of crashing on
        # .strip().
        if not isinstance(title, str):
            continue
        title = title.strip()
        if title:
            video_info[video_id] = {'title': title}
    return video_info


def _transcribe_folder(video_id, video_folder, asr_folder):
    """Run ASR on the first video file in ``video_folder``.

    The transcript is written to ``<asr_folder>/<video_id>.txt`` and the video
    folder is removed afterwards.

    Returns:
        True if a video file was found and processed, False if the folder
        contains no file with a recognised extension (it is left untouched).
    """
    for filename in os.listdir(video_folder):
        video_type = filename.split('.')[-1]
        if video_type not in _VIDEO_EXTENSIONS:
            continue
        video_file = os.path.join(video_folder, filename)
        # 1. Speech recognition.
        asr_res_initial = get_whisper_asr(video=video_file)
        print(video_id, asr_res_initial)
        # 2. Persist the recognition result.
        asr_path = os.path.join(asr_folder, f"{video_id}.txt")
        with open(asr_path, 'w', encoding='utf-8') as wf:
            wf.write(asr_res_initial)
        # The video has been processed; delete its folder.  Only one file per
        # folder is transcribed, so stop here.
        shutil.rmtree(video_folder)
        return True
    return False


def asr_process(project, table, dt):
    """Transcribe downloaded videos that belong to the given table partition.

    Reads ``videoid`` / ``title`` from the feature table, then repeatedly
    scans the ``videos`` folder.  For every entry whose name is a known video
    id:

    * if the video has no usable title, its folder is deleted without ASR;
    * otherwise the first ``mp4`` / ``m3u8`` file in it is transcribed, the
      text is written to ``asr_res/<video_id>.txt`` and the folder is deleted.

    The scan stops once the folder has been seen with fewer than two entries
    ``_MAX_EMPTY_POLLS`` times (not necessarily consecutively), sleeping
    ``_POLL_INTERVAL_SECONDS`` after each such observation.

    Args:
        project: project name forwarded to ``get_feature_data``.
        table: table name forwarded to ``get_feature_data``.
        dt: partition date string forwarded to ``get_feature_data``.
    """
    # Fetch feature data (assumed to be a pandas-like DataFrame).
    feature_df = get_feature_data(project=project, table=table, dt=dt, features=features)
    video_id_list = feature_df['videoid'].to_list()
    video_info = _collect_video_info(video_id_list, feature_df['title'].to_list())
    known_ids = set(video_id_list)

    # Pick up already-downloaded videos and run ASR on them.
    download_folder = 'videos'
    asr_folder = 'asr_res'
    os.makedirs(asr_folder, exist_ok=True)

    retry = 0
    while retry < _MAX_EMPTY_POLLS:
        video_folder_list = os.listdir(download_folder)
        # NOTE(review): the "< 2" threshold presumably leaves room for an entry
        # that is still being downloaded -- confirm with the downloader.
        if len(video_folder_list) < 2:
            retry += 1
            time.sleep(_POLL_INTERVAL_SECONDS)
            continue

        handled = 0
        for video_id in video_folder_list:
            if video_id not in known_ids:
                continue
            video_folder = os.path.join(download_folder, video_id)
            if video_id not in video_info:
                # No usable title: drop the download, best effort.
                try:
                    shutil.rmtree(video_folder)
                except OSError as e:
                    print(f"failed to remove {video_folder}: {e}")
                else:
                    handled += 1
                continue
            if _transcribe_folder(video_id, video_folder, asr_folder):
                handled += 1

        if not handled:
            # Nothing could be processed in this pass (unknown entries, or
            # folders without a video file yet); pause instead of re-listing
            # the directory in a tight loop.
            # NOTE(review): entries that never become processable keep this
            # loop alive indefinitely, as before -- consider a bound.
            time.sleep(_IDLE_PASS_WAIT_SECONDS)


def timer_check():
    """Run the ASR job once yesterday's data is available.

    Checks the row count of yesterday's partition.  If it is positive,
    ``asr_process`` is run; otherwise the check is re-scheduled in 60 seconds
    via ``threading.Timer``.  Any exception is caught and printed together
    with its traceback; no further check is scheduled in that case.
    """
    try:
        project = config_.DAILY_VIDEO['project']
        table = config_.DAILY_VIDEO['table']
        now_date = datetime.datetime.today()
        print(f"now_date: {now_date.strftime('%Y%m%d')}")
        dt = (now_date - datetime.timedelta(days=1)).strftime('%Y%m%d')
        # Check whether the data is ready.
        data_count = data_check(project=project, table=table, dt=dt)
        if data_count > 0:
            print(f'videos count = {data_count}')
            # Data is ready: process the downloaded videos.
            asr_process(project=project, table=table, dt=dt)
            print("videos asr finished!")
        else:
            # Data is not ready yet: check again in one minute.
            Timer(60, timer_check).start()
    except Exception as e:
        print(f"视频asr识别失败, exception: {e}, traceback: {traceback.format_exc()}")


if __name__ == '__main__':
    timer_check()