# temporary_process.py
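"""Tag videos via GPT from ASR transcripts stored in a Feishu sheet.

Reads each video's ASR transcript and title from a source sheet, checks that
the transcript is usable, cleans it, asks GPT for a summary and keywords
(prompt 1) and then for category labels with confidences (prompt 2), and
writes one result row per video back to a target sheet.
"""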

import json
import traceback

from feishu import FeiShuHelper
from audio_process import get_wav
from xunfei_asr import call_asr
from utils import download_video, asr_validity_discrimination
from gpt_tag import get_tag, request_gpt
from config import set_config
from log import Log

config_ = set_config()
log_ = Log()


def main(sheet_info_config):
    video_spreadsheet_token = sheet_info_config['video_spreadsheet_token']
    video_sheet_id = sheet_info_config['video_sheet_id']
    read_start_row = sheet_info_config['read_start_row']
    res_spreadsheet_token = sheet_info_config['res_spreadsheet_token']
    res_sheet_id = sheet_info_config['res_sheet_id']
    write_start_row = sheet_info_config['write_start_row']
    write_start_col = sheet_info_config['write_start_col']
    write_end_col = sheet_info_config['write_end_col']

    # 1. Read the Feishu sheet to get each video's ASR transcript and title
    feishu_helper = FeiShuHelper()
    data = feishu_helper.get_data(spreadsheet_token=video_spreadsheet_token, sheet_id=video_sheet_id)
    videos = []
    for item in data[read_start_row:]:
        try:
            videos.append(
                {
                    'videoId': item[0],
                    'title': item[3],
                    'asrRes': item[2],
                }
            )
        except Exception:
            # Skip rows that are missing the expected columns
            continue
    log_.info(f"videos count: {len(videos)}")

    result = []
    for i, video in enumerate(videos):
        try:
            log_.info(f"i = {i}, video = {video}")
            asr_res_initial = video['asrRes']
            title = video['title']

            # 2. Check whether the ASR transcript is usable
            validity = asr_validity_discrimination(text=asr_res_initial)
            log_.info(f"validity = {validity}")
            if validity is True:
                # 3. Clean the ASR transcript
                asr_res = asr_res_initial.strip().replace('\n', '')
                for stop_word in config_.STOP_WORDS:
                    asr_res = asr_res.replace(stop_word, '')
                # Token limit: keep at most the last 2500 characters
                asr_res = asr_res[-2500:]

                # 4. Generate results with GPT
                # 4.1 GPT produces a summary and keywords
                prompt1 = f"{config_.GPT_PROMPT['tags']['prompt6']}{asr_res.strip()}"
                gpt_res1 = request_gpt(prompt=prompt1)
                log_.info(f"gpt_res1 = {gpt_res1}, type = {type(gpt_res1)}")
                if gpt_res1 is None:
                    result = [[str(validity), prompt1, '', '', '', '', '']]
                else:
                    result = [[str(validity), prompt1, gpt_res1]]
                    # 4.2 Classify using the summary, keywords and title
                    try:
                        gpt_res1_json = json.loads(gpt_res1)
                        summary = gpt_res1_json['summary']
                        keywords = gpt_res1_json['keywords']
                        result[0].extend([summary, str(keywords)])
                        # Prompt labels are Chinese for: title / summary / keywords
                        prompt2_param = f"标题:{title}\n概况:{summary}\n关键词:{keywords}"
                        prompt2 = f"{config_.GPT_PROMPT['tags']['prompt7']}{prompt2_param}"
                        log_.info(f"prompt2: {prompt2}")
                        gpt_res2 = request_gpt(prompt=prompt2)
                        log_.info(f"gpt_res2 = {gpt_res2}, type = {type(gpt_res2)}")
                        if gpt_res2 is None:
                            result[0].extend(['', '', ''])
                        else:
                            # Keep only categories predicted with confidence > 0.5
                            confidence_up_list = []
                            try:
                                for item in json.loads(gpt_res2):
                                    if item['confidence'] > 0.5:
                                        confidence_up_list.append(item['category'])
                            except Exception:
                                pass
                            confidence_up = ', '.join(confidence_up_list)
                            result[0].extend([prompt2, gpt_res2, confidence_up])
                    except Exception:
                        result[0].extend(['', '', '', '', ''])
            else:
                result = [[str(validity), '', '', '', '', '', '', '']]
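            # Each result row fills columns write_start_col..write_end_col
            # (E..L in the config below): validity, prompt1, gpt_res1,
            # summary, keywords, prompt2, gpt_res2, confidence_up.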
            log_.info(f"result = {result}")

            # 5. Write the result row back to the Feishu sheet
            if len(result) > 0:
                feishu_helper.update_values(
                    sheet_token=res_spreadsheet_token,
                    sheet_id=res_sheet_id,
                    data=result,
                    start_row=write_start_row,
                    start_column=write_start_col,
                    end_column=write_end_col
                )
                log_.info("write to feishu success!")
                write_start_row += 1
        except Exception as e:
            log_.error(e)
            log_.error(traceback.format_exc())
            continue


if __name__ == '__main__':
    sheet_info = {
        'top100新promt-0605': {
            'video_spreadsheet_token': 'DkiUsqwJ6hmBxstBYyEcNE4ante',
            'video_sheet_id': 'tbd971',
            'read_start_row': 1,
            'res_spreadsheet_token': 'DkiUsqwJ6hmBxstBYyEcNE4ante',
            'res_sheet_id': 'tbd971',
            'write_start_row': 2,
            'write_start_col': 'E',
            'write_end_col': 'L'
        }
    }
    for sheet_tag, sheet_item in sheet_info.items():
        print(sheet_tag)
        main(sheet_info_config=sheet_item)
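
    # The commented-out block below is the full offline pipeline, kept for
    # reference: download the video, extract its audio, run ASR, then tag
    # with GPT (presumably for videos whose transcripts are not yet in the
    # sheet).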
    # video_path = download_video(
    #     video_url='http://rescdn.yishihui.com/longvideo/video/vpc/20230420/22421791F3yZJNHSelDuvs04zd',
    #     video_id='001', download_folder='videos', ftype='mp4')
    # print(video_path)
    # # 3. Extract the audio track from the video
    # audio_path = get_wav(video_path=video_path)
    # print(audio_path)
    # log_.info(f"audio_path = {audio_path}")
    # # 4. ASR
    # asr_res = call_asr(audio_path=audio_path)
    # print(asr_res)
    # log_.info(f"asr_res = {asr_res}")
    # # 5. Generate results with GPT
    # gpt_res = get_tag(text=asr_res)
    # print(gpt_res)