import json
import traceback

from feishu import FeiShuHelper
from audio_process import get_wav
from xunfei_asr import call_asr
from utils import download_video, asr_validity_discrimination
from gpt_tag import get_tag, request_gpt
from config import set_config
from log import Log

config_ = set_config()
log_ = Log()


def _extract_confident_categories(gpt_res, threshold=0.5):
    """Parse the GPT JSON response and return a ', '-joined string of the
    categories whose confidence is strictly greater than *threshold*.

    Best-effort: returns '' when *gpt_res* is not valid JSON or items lack
    the expected 'confidence'/'category' keys (mirrors the original
    silent-fallback behavior).
    """
    categories = []
    try:
        for item in json.loads(gpt_res):
            if item['confidence'] > threshold:
                categories.append(item['category'])
    except Exception:
        # Malformed JSON or unexpected item shape -> no categories.
        pass
    return ', '.join(categories)


def _write_result_row(feishu_helper, sheet_info_config, row_data, start_row):
    """Write one result row (list-of-lists) to the result sheet at *start_row*.

    Column range and target sheet come from *sheet_info_config*
    ('res_spreadsheet_token', 'res_sheet_id', 'write_start_col',
    'write_end_col'). The caller is responsible for advancing the row index.
    """
    feishu_helper.data_to_feishu_sheet(
        sheet_token=sheet_info_config['res_spreadsheet_token'],
        sheet_id=sheet_info_config['res_sheet_id'],
        data=row_data,
        start_row=start_row,
        start_column=sheet_info_config['write_start_col'],
        end_column=sheet_info_config['write_end_col']
    )


def main(sheet_info_config):
    """Tag videos listed in a Feishu sheet and write results back to Feishu.

    For each row of the source sheet: download the video, extract its audio,
    run ASR, check the transcript's validity, ask GPT for category tags, and
    append an 8-column result row to the result sheet. Rows that fail any
    step still get a (partially empty) result row so source and result
    sheets stay aligned.

    Expected keys in *sheet_info_config*: video_spreadsheet_token,
    video_sheet_id, read_start_row, res_spreadsheet_token, res_sheet_id,
    write_start_row, write_start_col, write_end_col.
    """
    video_spreadsheet_token = sheet_info_config['video_spreadsheet_token']
    video_sheet_id = sheet_info_config['video_sheet_id']
    read_start_row = sheet_info_config['read_start_row']
    write_start_row = sheet_info_config['write_start_row']

    # 1. Read the source Feishu sheet to collect video url + videoId rows.
    feishu_helper = FeiShuHelper()
    data = feishu_helper.get_data(spreadsheet_token=video_spreadsheet_token,
                                  sheet_id=video_sheet_id)
    videos = []
    for item in data[read_start_row:]:
        print(item)
        try:
            videos.append(
                {
                    'videoId': item[0],
                    'title': item[1],
                    # Cell is a rich-text cell: the url lives in [0]['text'].
                    'videoPath': item[3][0]['text'],
                }
            )
        except Exception:
            # Row is missing expected cells -> skip it.
            continue
    log_.info(f"videos count: {len(videos)}")

    for i, video in enumerate(videos):
        try:
            log_.info(f"i = {i}, video = {video}")
            video_id = video['videoId']
            video_path = video['videoPath']

            if not video_path.endswith('.mp4'):
                # Not an mp4 url: write a placeholder row and move on.
                result = [[video_id, video_path, video['title'], '', '', '', '', '']]
                log_.info(f"result = {result}")
                _write_result_row(feishu_helper, sheet_info_config, result, write_start_row)
                log_.info(f"write to feishu success!")
                write_start_row += 1
                continue

            # 2./3. Download the video, extract the audio track, run ASR.
            try:
                video_file = download_video(video_path=video_path, video_id=video_id,
                                            download_folder='videos')
                log_.info(f"video_path = {video_file}")
                audio_path = get_wav(video_path=video_file)
                log_.info(f"audio_path = {audio_path}")
                dialogue_path, asr_res_initial = call_asr(audio_path=audio_path)
                log_.info(f"asr_res_initial = {asr_res_initial}")
            except Exception:
                # Pipeline failed for this video: log, write a placeholder
                # row so the sheets stay aligned, and continue with the next.
                log_.error(traceback.format_exc())
                result = [[video_id, video_path, video['title'], '', '', '', '', '']]
                log_.info(f"result = {result}")
                _write_result_row(feishu_helper, sheet_info_config, result, write_start_row)
                log_.info(f"write to feishu success!")
                write_start_row += 1
                continue

            # 4. Decide whether the ASR transcript is usable at all.
            validity = asr_validity_discrimination(text=asr_res_initial)
            log_.info(f"validity = {validity}")
            # NOTE(review): kept the strict `is True` check — the helper may
            # return a non-bool value whose truthiness differs; confirm
            # against asr_validity_discrimination before relaxing.
            if validity is True:
                # 5. Clean the transcript: drop newlines and stop words,
                # then keep only the last 2500 chars (GPT token limit).
                asr_res = asr_res_initial.strip().replace('\n', '')
                for stop_word in config_.STOP_WORDS:
                    asr_res = asr_res.replace(stop_word, '')
                asr_res = asr_res[-2500:]

                # 6. Ask GPT for category tags.
                prompt = f"{config_.GPT_PROMPT['tags']['prompt5']}{asr_res.strip()}"
                gpt_res = request_gpt(prompt=prompt)
                log_.info(f"gpt_res = {gpt_res}, type = {type(gpt_res)}")

                # 7. Build the 8-column result row.
                if gpt_res is None:
                    result = [[video_id, video_path, video['title'], str(validity),
                               asr_res_initial, prompt, '', '']]
                else:
                    confidence_up = _extract_confident_categories(gpt_res)
                    result = [[video_id, video_path, video['title'], str(validity),
                               asr_res_initial, prompt, gpt_res, confidence_up]]
            else:
                # Transcript judged invalid: record the verdict only.
                result = [[video_id, video_path, video['title'], str(validity),
                           asr_res_initial, '', '', '']]

            log_.info(f"result = {result}")
            _write_result_row(feishu_helper, sheet_info_config, result, write_start_row)
            log_.info(f"write to feishu success!")
            write_start_row += 1
        except Exception as e:
            # Last-resort guard: never let one bad video abort the batch.
            log_.error(e)
            log_.error(traceback.format_exc())
            continue


if __name__ == '__main__':
    sheet_info = {
        '历史视频top5000回流倒叙排列': {
            'video_spreadsheet_token': 'L4ywsRaV2hFLv1t4Athcdw71nde',
            'video_sheet_id': 'hRjMrL',
            'read_start_row': 1456,
            'res_spreadsheet_token': 'DkiUsqwJ6hmBxstBYyEcNE4ante',
            'res_sheet_id': '7Fua00',
            'write_start_row': 1455,
            'write_start_col': 'A',
            'write_end_col': 'H'
        }
    }
    for sheet_tag, sheet_item in sheet_info.items():
        print(sheet_tag)
        main(sheet_info_config=sheet_item)