import json
import traceback

from feishu import FeiShuHelper
from audio_process import get_wav
from xunfei_asr import call_asr
from utils import download_video, asr_validity_discrimination
from gpt_tag import get_tag, request_gpt
from config import set_config
from log import Log

config_ = set_config()
log_ = Log()


def main(sheet_info_config):
    video_spreadsheet_token = sheet_info_config['video_spreadsheet_token']
    video_sheet_id = sheet_info_config['video_sheet_id']
    read_start_row = sheet_info_config['read_start_row']
    res_spreadsheet_token = sheet_info_config['res_spreadsheet_token']
    res_sheet_id = sheet_info_config['res_sheet_id']
    write_start_row = sheet_info_config['write_start_row']
    write_start_col = sheet_info_config['write_start_col']
    write_end_col = sheet_info_config['write_end_col']

    # 1. Read the Feishu sheet and get each video's ASR result and title
    feishu_helper = FeiShuHelper()
    data = feishu_helper.get_data(spreadsheet_token=video_spreadsheet_token, sheet_id=video_sheet_id)
    videos = []
    for item in data[read_start_row:]:
        # print(item)
        # print({'videoId': item[0], 'title': item[3], 'asrRes': item[2]})
        try:
            videos.append(
                {
                    'videoId': item[0],
                    'title': item[3],
                    'asrRes': item[2],
                }
            )
        except Exception:
            # Skip rows that do not have the expected columns
            continue
    log_.info(f"videos count: {len(videos)}")

    result = []
    for i, video in enumerate(videos):
        try:
            log_.info(f"i = {i}, video = {video}")
            asr_res_initial = video['asrRes']
            title = video['title']

            # 2. Check whether the ASR-recognized text is valid
            validity = asr_validity_discrimination(text=asr_res_initial)
            log_.info(f"validity = {validity}")

            if validity is True:
                # 3. Clean the ASR result
                asr_res = asr_res_initial.strip().replace('\n', '')
                for stop_word in config_.STOP_WORDS:
                    asr_res = asr_res.replace(stop_word, '')
                # Token limit: keep at most the last 2500 characters
                asr_res = asr_res[-2500:]

                # 4. Generate results with GPT
                # 4.1 Ask GPT for a summary and keywords
                prompt1 = f"{config_.GPT_PROMPT['tags']['prompt6']}{asr_res.strip()}"
                # gpt_res = get_tag(prompt=prompt)
                gpt_res1 = request_gpt(prompt=prompt1)
                # print(gpt_res)
                log_.info(f"gpt_res1 = {gpt_res1}, type = {type(gpt_res1)}")

                if gpt_res1 is None:
                    # Pad the row to 8 columns (E-L) so it stays aligned
                    result = [[str(validity), prompt1, '', '', '', '', '', '']]
                else:
                    result = [[str(validity), prompt1, gpt_res1]]
                    # 4.2 Classify based on the summary, keywords and title
                    try:
                        gpt_res1_json = json.loads(gpt_res1)
                        summary = gpt_res1_json['summary']
                        keywords = gpt_res1_json['keywords']
                        result[0].extend([summary, str(keywords)])
                        prompt2_param = f"标题:{title}\n概况:{summary}\n关键词:{keywords}"
                        prompt2 = f"{config_.GPT_PROMPT['tags']['prompt7']}{prompt2_param}"
                        log_.info(f"prompt2: {prompt2}")
                        gpt_res2 = request_gpt(prompt=prompt2)
                        log_.info(f"gpt_res2 = {gpt_res2}, type = {type(gpt_res2)}")
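                        # Note (assumption, inferred from the parsing below): gpt_res2 is
                        # expected to be a JSON array of objects such as
                        # {"category": "...", "confidence": 0.87}; only categories with
                        # confidence > 0.5 are kept for the summary column.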
                        if gpt_res2 is None:
                            result[0].extend(['', '', ''])
                        else:
                            # Keep only categories whose confidence is above 0.5
                            confidence_up_list = []
                            try:
                                for item in json.loads(gpt_res2):
                                    if item['confidence'] > 0.5:
                                        confidence_up_list.append(item['category'])
                            except Exception:
                                pass
                            confidence_up = ', '.join(confidence_up_list)
                            result[0].extend([prompt2, gpt_res2, confidence_up])
                    except Exception:
                        # Step 4.2 failed: pad the row with empty cells
                        result[0].extend(['', '', '', '', ''])
            else:
                result = [[str(validity), '', '', '', '', '', '', '']]
            log_.info(f"result = {result}")

            # 5. Write the result row to the Feishu sheet
            if len(result) > 0:
                feishu_helper.update_values(
                    sheet_token=res_spreadsheet_token,
                    sheet_id=res_sheet_id,
                    data=result,
                    start_row=write_start_row,
                    start_column=write_start_col,
                    end_column=write_end_col
                )
                log_.info("write to feishu success!")
                write_start_row += 1
        except Exception as e:
            log_.error(e)
            log_.error(traceback.format_exc())
            continue


if __name__ == '__main__':
    sheet_info = {
        'top100新promt-0605': {
            'video_spreadsheet_token': 'DkiUsqwJ6hmBxstBYyEcNE4ante',
            'video_sheet_id': 'tbd971',
            'read_start_row': 1,
            'res_spreadsheet_token': 'DkiUsqwJ6hmBxstBYyEcNE4ante',
            'res_sheet_id': 'tbd971',
            'write_start_row': 2,
            'write_start_col': 'E',
            'write_end_col': 'L'
        }
    }

    for sheet_tag, sheet_item in sheet_info.items():
        print(sheet_tag)
        main(sheet_info_config=sheet_item)

    # video_path = download_video(
    #     video_url='http://rescdn.yishihui.com/longvideo/video/vpc/20230420/22421791F3yZJNHSelDuvs04zd',
    #     video_id='001', download_folder='videos', ftype='mp4')
    # print(video_path)
    # # 3. Extract the audio track from the video
    # audio_path = get_wav(video_path=video_path)
    # print(audio_path)
    # log_.info(f"audio_path = {audio_path}")
    # # 4. ASR
    # asr_res = call_asr(audio_path=audio_path)
    # print(asr_res)
    # log_.info(f"asr_res = {asr_res}")
    # # 5. Generate results with GPT
    # gpt_res = get_tag(text=asr_res)
    # print(gpt_res)
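# Reference sketch (assumptions inferred from the parsing above, not confirmed by the sheet itself):
# gpt_res1 is expected to be a JSON object like {"summary": "...", "keywords": ["...", "..."]},
# and each row written to columns E-L is
#     [validity, prompt1, gpt_res1, summary, keywords, prompt2, gpt_res2, confidence_up]
# where confidence_up is the comma-separated list of categories with confidence > 0.5.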