main_process.py

import json
import traceback

from feishu import FeiShuHelper
from audio_process import get_wav
from xunfei_asr import call_asr
from utils import download_video, asr_validity_discrimination
from gpt_tag import get_tag, request_gpt
from config import set_config
from log import Log

config_ = set_config()
log_ = Log()
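
# Pipeline (per video): read the video list from a Feishu sheet, download the
# video, extract a wav track, run Xunfei ASR, validate and clean the transcript,
# ask GPT for category tags, then append one result row to a Feishu result sheet.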
def main(sheet_info_config):
    video_spreadsheet_token = sheet_info_config['video_spreadsheet_token']
    video_sheet_id = sheet_info_config['video_sheet_id']
    read_start_row = sheet_info_config['read_start_row']
    res_spreadsheet_token = sheet_info_config['res_spreadsheet_token']
    res_sheet_id = sheet_info_config['res_sheet_id']
    write_start_row = sheet_info_config['write_start_row']
    write_start_col = sheet_info_config['write_start_col']
    write_end_col = sheet_info_config['write_end_col']

    # 1. Read the Feishu sheet to get the video urls and videoIds
    feishu_helper = FeiShuHelper()
    data = feishu_helper.get_data(spreadsheet_token=video_spreadsheet_token, sheet_id=video_sheet_id)
    videos = []
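    # Expected row layout in the source sheet (inferred from the parsing below):
    # column 0 = videoId, column 1 = title, column 3 = a rich-text cell whose first
    # element's 'text' field holds the video URL. Malformed rows are skipped.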
    for item in data[read_start_row:]:
        print(item)
        try:
            videos.append(
                {
                    'videoId': item[0],
                    'title': item[1],
                    'videoPath': item[3][0]['text'],
                }
            )
        except:
            continue
    log_.info(f"videos count: {len(videos)}")

    result = []
    for i, video in enumerate(videos):
        try:
            log_.info(f"i = {i}, video = {video}")
            # 1. Download the video
            video_id = video['videoId']
            video_path = video['videoPath']
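            # Each result row holds 8 columns (A-H): videoId, videoPath, title, ASR
            # validity, raw ASR text, GPT prompt, raw GPT response, and the comma-joined
            # high-confidence categories; fields that could not be produced stay empty.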
            if video_path[-4:] != '.mp4':
                result = [[video_id, video_path, video['title'], '', '', '', '', '']]
                log_.info(f"result = {result}")
                if len(result) > 0:
                    feishu_helper.data_to_feishu_sheet(
                        sheet_token=res_spreadsheet_token,
                        sheet_id=res_sheet_id,
                        data=result,
                        start_row=write_start_row,
                        start_column=write_start_col,
                        end_column=write_end_col
                    )
                    log_.info(f"write to feishu success!")
                    write_start_row += 1
            else:
                try:
                    video_file = download_video(video_path=video_path, video_id=video_id, download_folder='videos')
                    # print(video_file)
                    log_.info(f"video_path = {video_file}")
                    # 2. Extract the audio track from the video
                    audio_path = get_wav(video_path=video_file)
                    # print(audio_path)
                    log_.info(f"audio_path = {audio_path}")
                    # 3. ASR
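                    # call_asr returns a path to the dialogue/transcript file plus the
                    # raw transcript text; only the raw text is used downstream here.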
                    dialogue_path, asr_res_initial = call_asr(audio_path=audio_path)
                    # print(asr_res)
                    log_.info(f"asr_res_initial = {asr_res_initial}")
                except:
                    log_.error(traceback.format_exc())
                    result = [[video_id, video_path, video['title'], '', '', '', '', '']]
                    log_.info(f"result = {result}")
                    if len(result) > 0:
                        feishu_helper.data_to_feishu_sheet(
                            sheet_token=res_spreadsheet_token,
                            sheet_id=res_sheet_id,
                            data=result,
                            start_row=write_start_row,
                            start_column=write_start_col,
                            end_column=write_end_col
                        )
                        log_.info(f"write to feishu success!")
                        write_start_row += 1
                    continue

                # 4. Check whether the ASR transcript is valid
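                # asr_validity_discrimination (from utils) is assumed to return a boolean;
                # transcripts judged invalid skip cleaning and GPT tagging entirely.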
                validity = asr_validity_discrimination(text=asr_res_initial)
                log_.info(f"validity = {validity}")
                if validity is True:
                    # 5. Clean the ASR result: strip whitespace, drop newlines and stop words
                    asr_res = asr_res_initial.strip().replace('\n', '')
                    for stop_word in config_.STOP_WORDS:
                        asr_res = asr_res.replace(stop_word, '')
                    # Token limit: keep at most the last 2500 characters
                    asr_res = asr_res[-2500:]
                    # 6. Get the tagging result from GPT
                    prompt = f"{config_.GPT_PROMPT['tags']['prompt5']}{asr_res.strip()}"
                    # gpt_res = get_tag(prompt=prompt)
                    gpt_res = request_gpt(prompt=prompt)
                    # print(gpt_res)
                    log_.info(f"gpt_res = {gpt_res}, type = {type(gpt_res)}")
                    # 7. Write the result to the Feishu sheet
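                    # gpt_res is expected (based on the parsing below) to be a JSON array like
                    # [{"category": "...", "confidence": 0.83}, ...]; categories with
                    # confidence > 0.5 are collected into confidence_up.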
                    if gpt_res is None:
                        result = [[video_id, video_path, video['title'], str(validity), asr_res_initial, prompt, '', '']]
                    else:
                        confidence_up_list = []
                        try:
                            for item in json.loads(gpt_res):
                                if item['confidence'] > 0.5:
                                    confidence_up_list.append(item['category'])
                        except:
                            pass
                        confidence_up = ', '.join(confidence_up_list)
                        result = [[video_id, video_path, video['title'], str(validity), asr_res_initial,
                                   prompt, gpt_res, confidence_up]]
                else:
                    result = [[video_id, video_path, video['title'], str(validity), asr_res_initial, '', '', '']]
                log_.info(f"result = {result}")
                if len(result) > 0:
                    feishu_helper.data_to_feishu_sheet(
                        sheet_token=res_spreadsheet_token,
                        sheet_id=res_sheet_id,
                        data=result,
                        start_row=write_start_row,
                        start_column=write_start_col,
                        end_column=write_end_col
                    )
                    log_.info(f"write to feishu success!")
                    write_start_row += 1
        except Exception as e:
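            # Any unhandled per-video failure is logged with its traceback and the
            # loop moves on to the next video.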
            log_.error(e)
            log_.error(traceback.format_exc())
            continue


if __name__ == '__main__':
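    # Per-sheet job config: where to read the video list (spreadsheet token, sheet id,
    # first row to read) and where to write results (spreadsheet token, sheet id,
    # start row, and the A-H column range).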
    sheet_info = {
        '历史视频top5000回流倒叙排列': {  # "historical top-5000 return-flow videos, in reverse order"
            'video_spreadsheet_token': 'L4ywsRaV2hFLv1t4Athcdw71nde',
            'video_sheet_id': 'hRjMrL',
            'read_start_row': 1456,
            'res_spreadsheet_token': 'DkiUsqwJ6hmBxstBYyEcNE4ante',
            'res_sheet_id': '7Fua00',
            'write_start_row': 1455,
            'write_start_col': 'A',
            'write_end_col': 'H'
        }
    }
    for sheet_tag, sheet_item in sheet_info.items():
        print(sheet_tag)
        main(sheet_info_config=sheet_item)

    # video_path = download_video(
    #     video_url='http://rescdn.yishihui.com/longvideo/video/vpc/20230420/22421791F3yZJNHSelDuvs04zd',
    #     video_id='001', download_folder='videos', ftype='mp4')
    # print(video_path)
    # # 3. Extract the audio from the video
    # audio_path = get_wav(video_path=video_path)
    # print(audio_path)
    # log_.info(f"audio_path = {audio_path}")
    # # 4. ASR
    # asr_res = call_asr(audio_path=audio_path)
    # print(asr_res)
    # log_.info(f"asr_res = {asr_res}")
    # # 5. GPT tagging result
    # gpt_res = get_tag(text=asr_res)
    # print(gpt_res)