# asr_task.py (4.9 KB)

  1. import os
  2. import shutil
  3. import json
  4. import datetime
  5. import sys
  6. import time
  7. import traceback
  8. import requests
  9. import multiprocessing
  10. from threading import Timer
  11. from utils import data_check, get_feature_data, asr_validity_discrimination
  12. from whisper_asr import get_whisper_asr
  13. from gpt_tag import request_gpt
  14. from config import set_config
  15. from log import Log
# Project configuration object; timer_check() reads DAILY_VIDEO['project'] and
# DAILY_VIDEO['table'] from it.
config_ = set_config()
# Instance of the project's Log class; not referenced by the functions below.
log_ = Log()
# Column names that asr_process() passes to get_feature_data().
features = ['videoid', 'title', 'video_path']
  19. def get_asr(video_id, download_folder, asr_folder):
  20. video_folder = os.path.join(download_folder, video_id)
  21. for filename in os.listdir(video_folder):
  22. video_type = filename.split('.')[-1]
  23. if video_type in ['mp4', 'm3u8']:
  24. video_file = os.path.join(video_folder, filename)
  25. # 1. asr识别
  26. asr_res_initial = get_whisper_asr(video=video_file)
  27. print(video_id, asr_res_initial)
  28. # 2. 识别结果写入文件
  29. asr_path = os.path.join(asr_folder, f"{video_id}.txt")
  30. with open(asr_path, 'w', encoding='utf-8') as wf:
  31. wf.write(asr_res_initial)
  32. # 将处理过的视频进行删除
  33. shutil.rmtree(os.path.join(download_folder, video_id))
  34. break
  35. def asr_process(project, table, dt, cuda_id):
  36. # 获取特征数据
  37. feature_df = get_feature_data(project=project, table=table, dt=dt, features=features)
  38. video_id_list = feature_df['videoid'].to_list()
  39. video_info = {}
  40. for video_id in video_id_list:
  41. title = feature_df[feature_df['videoid'] == video_id]['title'].values[0]
  42. if title is None:
  43. continue
  44. title = title.strip()
  45. if len(title) > 0:
  46. video_info[video_id] = {'title': title}
  47. # 获取已下载视频,做asr识别
  48. download_folder = 'videos'
  49. download_folder = f'{download_folder}_{cuda_id}'
  50. asr_folder = 'asr_res'
  51. if not os.path.exists(asr_folder):
  52. os.makedirs(asr_folder)
  53. retry = 0
  54. while retry < 3:
  55. video_folder_list = os.listdir(download_folder)
  56. if len(video_folder_list) < 2:
  57. retry += 1
  58. time.sleep(60)
  59. continue
  60. retry = 0
  61. # for video_id in video_folder_list:
  62. # if video_id not in video_id_list:
  63. # continue
  64. # if video_info.get(video_id, None) is None:
  65. # try:
  66. # shutil.rmtree(os.path.join(download_folder, video_id))
  67. # except:
  68. # continue
  69. # else:
  70. # video_folder = os.path.join(download_folder, video_id)
  71. # for filename in os.listdir(video_folder):
  72. # video_type = filename.split('.')[-1]
  73. # if video_type in ['mp4', 'm3u8']:
  74. # video_file = os.path.join(video_folder, filename)
  75. # # 1. asr识别
  76. # asr_res_initial = get_whisper_asr(video=video_file)
  77. # print(video_id, asr_res_initial)
  78. # # 2. 识别结果写入文件
  79. # asr_path = os.path.join(asr_folder, f"{video_id}.txt")
  80. # with open(asr_path, 'w', encoding='utf-8') as wf:
  81. # wf.write(asr_res_initial)
  82. # # 将处理过的视频进行删除
  83. # shutil.rmtree(os.path.join(download_folder, video_id))
  84. # break
  85. pool = multiprocessing.Pool(processes=2)
  86. for video_id in video_folder_list:
  87. if video_id not in video_id_list:
  88. continue
  89. if video_info.get(video_id, None) is None:
  90. try:
  91. shutil.rmtree(os.path.join(download_folder, video_id))
  92. except:
  93. continue
  94. else:
  95. pool.apply_async(
  96. func=get_asr,
  97. args=(video_id, download_folder, asr_folder)
  98. )
  99. pool.close()
  100. pool.join()
  101. def timer_check():
  102. try:
  103. cuda_id = sys.argv[1]
  104. project = config_.DAILY_VIDEO['project']
  105. table = config_.DAILY_VIDEO['table']
  106. now_date = datetime.datetime.today()
  107. print(f"now_date: {datetime.datetime.strftime(now_date, '%Y%m%d')}")
  108. dt = datetime.datetime.strftime(now_date-datetime.timedelta(days=1), '%Y%m%d')
  109. # 查看数据是否已准备好
  110. data_count = data_check(project=project, table=table, dt=dt)
  111. if data_count > 0:
  112. print(f'videos count = {data_count}')
  113. # 数据准备好,进行视频下载
  114. asr_process(project=project, table=table, dt=dt, cuda_id=cuda_id)
  115. print(f"videos asr finished!")
  116. else:
  117. # 数据没准备好,1分钟后重新检查
  118. Timer(60, timer_check).start()
  119. except Exception as e:
  120. print(f"视频asr识别失败, exception: {e}, traceback: {traceback.format_exc()}")
  121. if __name__ == '__main__':
  122. timer_check()