# main_process.py — download videos listed in a Feishu sheet, run ASR and
# GPT tagging on each one, and write the results back to a Feishu sheet.
  1. import traceback
  2. from feishu import FeiShuHelper
  3. from audio_process import get_wav
  4. from xunfei_asr import call_asr
  5. from utils import download_video, asr_validity_discrimination
  6. from gpt_tag import get_tag
  7. from config import set_config
  8. from log import Log
  9. config_ = set_config()
  10. log_ = Log()
  11. def main(sheet_info_config):
  12. video_spreadsheet_token = sheet_info_config['video_spreadsheet_token']
  13. video_sheet_id = sheet_info_config['video_sheet_id']
  14. read_start_row = sheet_info_config['read_start_row']
  15. res_spreadsheet_token = sheet_info_config['res_spreadsheet_token']
  16. res_sheet_id = sheet_info_config['res_sheet_id']
  17. write_start_row = sheet_info_config['write_start_row']
  18. write_start_col = sheet_info_config['write_start_col']
  19. write_end_col = sheet_info_config['write_end_col']
  20. # 1. 读取飞书表格,获取视频url和videoId
  21. feishu_helper = FeiShuHelper()
  22. data = feishu_helper.get_data(spreadsheet_token=video_spreadsheet_token, sheet_id=video_sheet_id)
  23. videos = []
  24. for item in data[read_start_row:]:
  25. print(item)
  26. try:
  27. videos.append(
  28. {
  29. 'videoId': item[0],
  30. 'title': item[1],
  31. 'videoPath': item[3][0]['text'],
  32. }
  33. )
  34. except:
  35. continue
  36. log_.info(f"videos count: {len(videos)}")
  37. result = []
  38. for i, video in enumerate(videos):
  39. try:
  40. log_.info(f"i = {i}, video = {video}")
  41. # 1. 下载视频
  42. video_id = video['videoId']
  43. video_path = video['videoPath']
  44. video_file = download_video(video_path=video_path, video_id=video_id, download_folder='videos')
  45. # print(video_file)
  46. log_.info(f"video_path = {video_file}")
  47. # 2. 获取视频中的音频
  48. audio_path = get_wav(video_path=video_file)
  49. # print(audio_path)
  50. log_.info(f"audio_path = {audio_path}")
  51. # 3. asr
  52. dialogue_path, asr_res_initial = call_asr(audio_path=audio_path)
  53. # print(asr_res)
  54. log_.info(f"asr_res_initial = {asr_res_initial}")
  55. # 4. 判断asr识别的文本是否有效
  56. validity = asr_validity_discrimination(text=asr_res_initial)
  57. log_.info(f"validity = {validity}")
  58. # 5. 对asr结果进行清洗
  59. asr_res = asr_res_initial.strip().replace('\n', '')
  60. for stop_word in config_.STOP_WORDS:
  61. asr_res = asr_res.replace(stop_word, '')
  62. # token限制: 字数 <= 2500
  63. asr_res = asr_res[-2500:]
  64. # 6. gpt产出结果
  65. prompt = f"{config_.GPT_PROMPT['tags']['prompt4']}{asr_res.strip()}"
  66. gpt_res = get_tag(prompt=prompt)
  67. # print(gpt_res)
  68. log_.info(f"gpt_res = {gpt_res}")
  69. # 7. 结果写入飞书表格
  70. result = [[video_id, video_path, video['title'], str(validity), asr_res_initial, gpt_res, prompt]]
  71. log_.info(f"result = {result}")
  72. if len(result) > 0:
  73. feishu_helper.data_to_feishu_sheet(
  74. sheet_token=res_spreadsheet_token,
  75. sheet_id=res_sheet_id,
  76. data=result,
  77. start_row=write_start_row,
  78. start_column=write_start_col,
  79. end_column=write_end_col
  80. )
  81. log_.info(f"write to feishu success!")
  82. write_start_row += 1
  83. except Exception as e:
  84. log_.error(e)
  85. log_.error(traceback.format_exc())
  86. continue
  87. # 6. 结果写入飞书表格
  88. # if len(result) > 0:
  89. # feishu_helper.data_to_feishu_sheet(
  90. # sheet_token=res_spreadsheet_token,
  91. # sheet_id=res_sheet_id,
  92. # data=result,
  93. # start_row=write_start_row,
  94. # start_column=write_start_col,
  95. # end_column=write_end_col
  96. # )
  97. if __name__ == '__main__':
  98. sheet_info = {
  99. '历史视频top5000回流倒叙排列': {
  100. 'video_spreadsheet_token': 'L4ywsRaV2hFLv1t4Athcdw71nde',
  101. 'video_sheet_id': 'hRjMrL',
  102. 'read_start_row': 2,
  103. 'res_spreadsheet_token': 'DkiUsqwJ6hmBxstBYyEcNE4ante',
  104. 'res_sheet_id': '7Fua00',
  105. 'write_start_row': 3,
  106. 'write_start_col': 'A',
  107. 'write_end_col': 'I'
  108. }
  109. }
  110. for sheet_tag, sheet_item in sheet_info.items():
  111. print(sheet_tag)
  112. main(sheet_info_config=sheet_item)
  113. # video_path = download_video(
  114. # video_url='http://rescdn.yishihui.com/longvideo/video/vpc/20230420/22421791F3yZJNHSelDuvs04zd',
  115. # video_id='001', download_folder='videos', ftype='mp4')
  116. # print(video_path)
  117. # # 3. 获取视频中的音频
  118. # audio_path = get_wav(video_path=video_path)
  119. # print(audio_path)
  120. # log_.info(f"audio_path = {audio_path}")
  121. # # 4. asr
  122. # asr_res = call_asr(audio_path=audio_path)
  123. # print(asr_res)
  124. # log_.info(f"asr_res = {asr_res}")
  125. # # 5. gpt产出结果
  126. # gpt_res = get_tag(text=asr_res)
  127. # print(gpt_res)