# asr_task.py (3.4 KB)
  1. import os
  2. import shutil
  3. import json
  4. import datetime
  5. import time
  6. import traceback
  7. import requests
  8. import multiprocessing
  9. from threading import Timer
  10. from utils import data_check, get_feature_data, asr_validity_discrimination
  11. from whisper_asr import get_whisper_asr
  12. from gpt_tag import request_gpt
  13. from config import set_config
  14. from log import Log
  15. config_ = set_config()
  16. log_ = Log()
  17. features = ['videoid', 'title', 'video_path']
  18. def asr_process(project, table, dt):
  19. # 获取特征数据
  20. feature_df = get_feature_data(project=project, table=table, dt=dt, features=features)
  21. video_id_list = feature_df['videoid'].to_list()
  22. video_info = {}
  23. for video_id in video_id_list:
  24. title = feature_df[feature_df['videoid'] == video_id]['title'].values[0]
  25. if title is None:
  26. continue
  27. title = title.strip()
  28. if len(title) > 0:
  29. video_info[video_id] = {'title': title}
  30. # 获取已下载视频,做asr识别
  31. download_folder = 'videos'
  32. asr_folder = 'asr_res'
  33. if not os.path.exists(asr_folder):
  34. os.makedirs(asr_folder)
  35. retry = 0
  36. while retry < 3:
  37. video_folder_list = os.listdir(download_folder)
  38. if len(video_folder_list) < 2:
  39. retry += 1
  40. time.sleep(60)
  41. continue
  42. for video_id in video_folder_list:
  43. if video_id not in video_id_list:
  44. continue
  45. if video_info.get(video_id, None) is None:
  46. shutil.rmtree(os.path.join(download_folder, video_id))
  47. else:
  48. video_folder = os.path.join(download_folder, video_id)
  49. for filename in os.listdir(video_folder):
  50. video_type = filename.split('.')[-1]
  51. if video_type in ['mp4', 'm3u8']:
  52. video_file = os.path.join(video_folder, filename)
  53. # 1. asr识别
  54. asr_res_initial = get_whisper_asr(video=video_file)
  55. print(video_id, asr_res_initial)
  56. # 2. 识别结果写入文件
  57. asr_path = os.path.join(asr_folder, f"{video_id}.txt")
  58. with open(asr_path, 'w', encoding='utf-8') as wf:
  59. wf.write(asr_res_initial)
  60. # 将处理过的视频进行删除
  61. shutil.rmtree(os.path.join(download_folder, video_id))
  62. else:
  63. shutil.rmtree(os.path.join(download_folder, video_id))
  64. def timer_check():
  65. try:
  66. project = config_.DAILY_VIDEO['project']
  67. table = config_.DAILY_VIDEO['table']
  68. now_date = datetime.datetime.today()
  69. print(f"now_date: {datetime.datetime.strftime(now_date, '%Y%m%d')}")
  70. dt = datetime.datetime.strftime(now_date-datetime.timedelta(days=1), '%Y%m%d')
  71. # 查看数据是否已准备好
  72. data_count = data_check(project=project, table=table, dt=dt)
  73. if data_count > 0:
  74. print(f'videos count = {data_count}')
  75. # 数据准备好,进行视频下载
  76. asr_process(project=project, table=table, dt=dt)
  77. print(f"videos asr finished!")
  78. else:
  79. # 数据没准备好,1分钟后重新检查
  80. Timer(60, timer_check).start()
  81. except Exception as e:
  82. print(f"视频asr识别失败, exception: {e}, traceback: {traceback.format_exc()}")
  83. if __name__ == '__main__':
  84. timer_check()