asr_task.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. import os
  2. import shutil
  3. import json
  4. import datetime
  5. import sys
  6. import time
  7. import traceback
  8. import requests
  9. import multiprocessing
  10. from threading import Timer
  11. from whisper_asr import get_whisper_asr
  12. from gpt_tag import request_gpt
  13. from config import set_config
  14. from log import Log
  15. config_ = set_config()
  16. log_ = Log()
  17. features = ['videoid', 'title', 'video_path']
  18. def get_asr(video_id, download_folder, asr_folder):
  19. video_folder = os.path.join(download_folder, video_id)
  20. for filename in os.listdir(video_folder):
  21. video_type = filename.split('.')[-1]
  22. if video_type in ['mp4', 'm3u8']:
  23. video_file = os.path.join(video_folder, filename)
  24. # 1. asr识别
  25. asr_res_initial = get_whisper_asr(video=video_file)
  26. print(video_id, asr_res_initial)
  27. # 2. 识别结果写入文件
  28. asr_path = os.path.join(asr_folder, f"{video_id}.txt")
  29. with open(asr_path, 'w', encoding='utf-8') as wf:
  30. wf.write(asr_res_initial)
  31. # 将处理过的视频进行删除
  32. shutil.rmtree(os.path.join(download_folder, video_id))
  33. break
  34. # def asr_process(project, table, dt, cuda_id):
  35. # # 获取特征数据
  36. # feature_df = get_feature_data(
  37. # project=project, table=table, dt=dt, features=features)
  38. # video_id_list = feature_df['videoid'].to_list()
  39. # video_info = {}
  40. # for video_id in video_id_list:
  41. # title = feature_df[feature_df['videoid']
  42. # == video_id]['title'].values[0]
  43. # if title is None:
  44. # continue
  45. # title = title.strip()
  46. # if len(title) > 0:
  47. # video_info[video_id] = {'title': title}
  48. # # 获取已下载视频,做asr识别
  49. # download_folder = 'videos'
  50. # download_folder = f'{download_folder}_{cuda_id}'
  51. # asr_folder = 'asr_res'
  52. # if not os.path.exists(asr_folder):
  53. # os.makedirs(asr_folder)
  54. # retry = 0
  55. # while retry < 3:
  56. # video_folder_list = os.listdir(download_folder)
  57. # if len(video_folder_list) < 1:
  58. # retry += 1
  59. # time.sleep(60)
  60. # continue
  61. # retry = 0
  62. # # for video_id in video_folder_list:
  63. # # if video_id not in video_id_list:
  64. # # continue
  65. # # if video_info.get(video_id, None) is None:
  66. # # try:
  67. # # shutil.rmtree(os.path.join(download_folder, video_id))
  68. # # except:
  69. # # continue
  70. # # else:
  71. # # video_folder = os.path.join(download_folder, video_id)
  72. # # for filename in os.listdir(video_folder):
  73. # # video_type = filename.split('.')[-1]
  74. # # if video_type in ['mp4', 'm3u8']:
  75. # # video_file = os.path.join(video_folder, filename)
  76. # # # 1. asr识别
  77. # # asr_res_initial = get_whisper_asr(video=video_file)
  78. # # print(video_id, asr_res_initial)
  79. # # # 2. 识别结果写入文件
  80. # # asr_path = os.path.join(asr_folder, f"{video_id}.txt")
  81. # # with open(asr_path, 'w', encoding='utf-8') as wf:
  82. # # wf.write(asr_res_initial)
  83. # # # 将处理过的视频进行删除
  84. # # shutil.rmtree(os.path.join(download_folder, video_id))
  85. # # break
  86. # pool = multiprocessing.Pool(processes=2)
  87. # for video_id in video_folder_list:
  88. # if video_id not in video_id_list:
  89. # continue
  90. # if video_info.get(video_id, None) is None:
  91. # try:
  92. # shutil.rmtree(os.path.join(download_folder, video_id))
  93. # except:
  94. # continue
  95. # else:
  96. # pool.apply_async(
  97. # func=get_asr,
  98. # args=(video_id, download_folder, asr_folder)
  99. # )
  100. # pool.close()
  101. # pool.join()
  102. # def timer_check():
  103. # try:
  104. # cuda_id = sys.argv[1]
  105. # project = config_.DAILY_VIDEO['project']
  106. # table = config_.DAILY_VIDEO['table']
  107. # now_date = datetime.datetime.today()
  108. # print(f"now_date: {datetime.datetime.strftime(now_date, '%Y%m%d')}")
  109. # dt = datetime.datetime.strftime(
  110. # now_date-datetime.timedelta(days=1), '%Y%m%d')
  111. # # 查看数据是否已准备好
  112. # data_count = data_check(project=project, table=table, dt=dt)
  113. # if data_count > 0:
  114. # print(f'videos count = {data_count}')
  115. # download_folder = 'videos'
  116. # download_folder = f'{download_folder}_{cuda_id}'
  117. # if not os.path.exists(download_folder):
  118. # # 视频未下载好,1分钟后重新检查
  119. # Timer(60, timer_check).start()
  120. # else:
  121. # # 数据准备好,进行asr
  122. # asr_process(project=project, table=table,
  123. # dt=dt, cuda_id=cuda_id)
  124. # print(f"videos asr finished!")
  125. # else:
  126. # # 数据没准备好,1分钟后重新检查
  127. # Timer(60, timer_check).start()
  128. # except Exception as e:
  129. # print(
  130. # f"视频asr识别失败, exception: {e}, traceback: {traceback.format_exc()}")
  131. if __name__ == '__main__':
  132. # timer_check()
  133. cuda_id = sys.argv[1]
  134. download_folder = 'videos'
  135. download_folder = f'{download_folder}_{cuda_id}'
  136. if not os.path.exists(download_folder):
  137. print(f"download_folder: {download_folder} not exists!")
  138. exit(0)
  139. # 遍历download_folder下所有的子文件夹名,即video_id list
  140. video_folder_list = os.listdir(download_folder)
  141. if len(video_folder_list) < 1:
  142. print(f"video_folder_list is empty!")
  143. exit(0)
  144. asr_folder = 'asr_res'
  145. if not os.path.exists(asr_folder):
  146. os.makedirs(asr_folder)
  147. pool = multiprocessing.Pool(processes=5)
  148. for video_id in video_folder_list:
  149. pool.apply_async(
  150. func=get_asr,
  151. args=(video_id, download_folder, asr_folder)
  152. )
  153. pool.close()
  154. pool.join()
  155. print(f"videos asr finished!")