import json
import traceback

from feishu import FeiShuHelper
from audio_process import get_wav
from xunfei_asr import call_asr
from utils import download_video, asr_validity_discrimination
from gpt_tag import get_tag, request_gpt
from config import set_config
from log import Log

config_ = set_config()
log_ = Log()

# Every row written back has exactly this many cells:
# validity, prompt1, gpt_res1, summary, keywords, prompt2, gpt_res2, confident categories.
_RESULT_ROW_WIDTH = 8
# Only the trailing characters of the cleaned ASR text are sent to GPT (token limit).
_MAX_ASR_CHARS = 2500
# A category is kept only when its confidence is strictly greater than this value.
_CONFIDENCE_THRESHOLD = 0.5


def _fit_row(cells, width=_RESULT_ROW_WIDTH):
    """Return ``cells`` padded with '' (or truncated) to exactly ``width`` entries."""
    fitted = list(cells[:width])
    return fitted + [''] * (width - len(fitted))


def _clean_asr_text(text):
    """Strip the ASR text, drop newlines and configured stop words, keep the tail."""
    cleaned = text.strip().replace('\n', '')
    for stop_word in config_.STOP_WORDS:
        cleaned = cleaned.replace(stop_word, '')
    return cleaned[-_MAX_ASR_CHARS:]


def _confident_categories(gpt_res2):
    """Return the comma-joined categories whose confidence exceeds the threshold.

    ``gpt_res2`` is parsed as JSON and iterated; each entry is read through its
    'confidence' and 'category' keys. Parsing is best-effort: if the payload is
    malformed part-way through, the categories collected so far are still used.
    """
    categories = []
    try:
        for candidate in json.loads(gpt_res2):
            if candidate['confidence'] > _CONFIDENCE_THRESHOLD:
                categories.append(candidate['category'])
    except Exception:
        # Best-effort, as before, but no longer silent.
        log_.error(traceback.format_exc())
    return ', '.join(categories)


def _build_result_row(video):
    """Run validity check, cleaning and both GPT prompts for one video.

    Always returns a list of exactly ``_RESULT_ROW_WIDTH`` cells; stages that
    were skipped or failed are left as empty strings.
    """
    asr_res_initial = video['asrRes']
    title = video['title']

    # 2. Decide whether the ASR transcript is usable.
    validity = asr_validity_discrimination(text=asr_res_initial)
    log_.info(f"validity = {validity}")
    if validity is not True:
        return _fit_row([str(validity)])

    # 3. Clean the ASR transcript.
    asr_res = _clean_asr_text(asr_res_initial)

    # 4.1 First GPT call: produce summary and keywords.
    prompt1 = f"{config_.GPT_PROMPT['tags']['prompt6']}{asr_res.strip()}"
    gpt_res1 = request_gpt(prompt=prompt1)
    log_.info(f"gpt_res1 = {gpt_res1}, type = {type(gpt_res1)}")
    if gpt_res1 is None:
        return _fit_row([str(validity), prompt1])

    row = [str(validity), prompt1, gpt_res1]
    # 4.2 Second GPT call: classify using title, summary and keywords.
    try:
        gpt_res1_json = json.loads(gpt_res1)
        summary = gpt_res1_json['summary']
        keywords = gpt_res1_json['keywords']
        row.extend([summary, str(keywords)])
        prompt2_param = f"标题:{title}\n概况:{summary}\n关键词:{keywords}"
        prompt2 = f"{config_.GPT_PROMPT['tags']['prompt7']}{prompt2_param}"
        log_.info(f"prompt2: {prompt2}")
        gpt_res2 = request_gpt(prompt=prompt2)
        log_.info(f"gpt_res2 = {gpt_res2}, type = {type(gpt_res2)}")
        if gpt_res2 is not None:
            confidence_up = _confident_categories(gpt_res2)
            row.extend([prompt2, gpt_res2, confidence_up])
    except Exception:
        # Keep whatever was collected so far; the remaining cells are padded
        # below so the row width never depends on where the failure happened.
        log_.error(traceback.format_exc())
    return _fit_row(row)


def main(sheet_info_config):
    """Tag every video listed in a Feishu sheet and write the results back.

    Args:
        sheet_info_config: dict with the keys 'video_spreadsheet_token',
            'video_sheet_id', 'read_start_row' (index of the first data row in
            the fetched sheet data), 'res_spreadsheet_token', 'res_sheet_id',
            'write_start_row', 'write_start_col' and 'write_end_col'.

    Each source row is read as: column 0 -> videoId, column 2 -> ASR text,
    column 3 -> title. One result row is written per processed video, and the
    target row only advances after a successful write. A failure on one video
    is logged and does not stop the remaining videos.
    """
    video_spreadsheet_token = sheet_info_config['video_spreadsheet_token']
    video_sheet_id = sheet_info_config['video_sheet_id']
    read_start_row = sheet_info_config['read_start_row']
    res_spreadsheet_token = sheet_info_config['res_spreadsheet_token']
    res_sheet_id = sheet_info_config['res_sheet_id']
    write_start_row = sheet_info_config['write_start_row']
    write_start_col = sheet_info_config['write_start_col']
    write_end_col = sheet_info_config['write_end_col']

    # 1. Read the Feishu sheet to get each video's ASR text and title.
    feishu_helper = FeiShuHelper()
    data = feishu_helper.get_data(spreadsheet_token=video_spreadsheet_token, sheet_id=video_sheet_id)
    videos = []
    for source_row in data[read_start_row:]:
        try:
            videos.append(
                {
                    'videoId': source_row[0],
                    'title': source_row[3],
                    'asrRes': source_row[2],
                }
            )
        except (IndexError, TypeError, KeyError):
            # Rows that are too short or empty are skipped.
            continue
    log_.info(f"videos count: {len(videos)}")

    for i, video in enumerate(videos):
        try:
            log_.info(f"i = {i}, video = {video}")
            result = [_build_result_row(video)]
            log_.info(f"result = {result}")

            # 5. Write the result row to the Feishu sheet.
            feishu_helper.update_values(
                sheet_token=res_spreadsheet_token,
                sheet_id=res_sheet_id,
                data=result,
                start_row=write_start_row,
                start_column=write_start_col,
                end_column=write_end_col
            )
            log_.info(f"write to feishu success!")
            write_start_row += 1
        except Exception as e:
            log_.error(e)
            log_.error(traceback.format_exc())
            continue


if __name__ == '__main__':
    sheet_info = {
        'top100新promt-0605': {
            'video_spreadsheet_token': 'DkiUsqwJ6hmBxstBYyEcNE4ante',
            'video_sheet_id': 'tbd971',
            'read_start_row': 1,
            'res_spreadsheet_token': 'DkiUsqwJ6hmBxstBYyEcNE4ante',
            'res_sheet_id': 'tbd971',
            'write_start_row': 2,
            'write_start_col': 'E',
            'write_end_col': 'L'
        }
    }

    for sheet_tag, sheet_item in sheet_info.items():
        print(sheet_tag)
        main(sheet_info_config=sheet_item)

    # Scratch code kept from the original file (download -> audio -> ASR -> GPT):
    # video_path = download_video(
    #     video_url='http://rescdn.yishihui.com/longvideo/video/vpc/20230420/22421791F3yZJNHSelDuvs04zd',
    #     video_id='001', download_folder='videos', ftype='mp4')
    # print(video_path)
    # # 3. Extract the audio track from the video
    # audio_path = get_wav(video_path=video_path)
    # print(audio_path)
    # log_.info(f"audio_path = {audio_path}")
    # # 4. asr
    # asr_res = call_asr(audio_path=audio_path)
    # print(asr_res)
    # log_.info(f"asr_res = {asr_res}")
    # # 5. Produce the GPT result
    # gpt_res = get_tag(text=asr_res)
    # print(gpt_res)