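"""Video tagging pipeline: read video rows from a Feishu spreadsheet, download
each video, extract its audio, run Xunfei ASR on the audio, tag the transcript
with GPT, and write the per-video results back to a Feishu result sheet.
"""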
import json
import traceback

from feishu import FeiShuHelper
from audio_process import get_wav
from xunfei_asr import call_asr
from utils import download_video, asr_validity_discrimination
from gpt_tag import get_tag, request_gpt
from config import set_config
from log import Log

config_ = set_config()
log_ = Log()


def main(sheet_info_config):
    video_spreadsheet_token = sheet_info_config['video_spreadsheet_token']
    video_sheet_id = sheet_info_config['video_sheet_id']
    read_start_row = sheet_info_config['read_start_row']
    res_spreadsheet_token = sheet_info_config['res_spreadsheet_token']
    res_sheet_id = sheet_info_config['res_sheet_id']
    write_start_row = sheet_info_config['write_start_row']
    write_start_col = sheet_info_config['write_start_col']
    write_end_col = sheet_info_config['write_end_col']
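
    # read_start_row offsets into the rows fetched from the source sheet;
    # write_start_row is a row cursor in the result sheet and is advanced by
    # one after each result row is written.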

    # 1. Read the Feishu sheet to get each video's URL and videoId.
    feishu_helper = FeiShuHelper()
    data = feishu_helper.get_data(spreadsheet_token=video_spreadsheet_token, sheet_id=video_sheet_id)
    videos = []
    for item in data[read_start_row:]:
        print(item)
        try:
            videos.append(
                {
                    'videoId': item[0],
                    'title': item[1],
                    'videoPath': item[3][0]['text'],
                }
            )
        except Exception:
            # Skip rows that do not match the expected column layout.
            continue
    log_.info(f"videos count: {len(videos)}")
    def write_result_row(result):
        # Append one result row to the result sheet, then advance the row
        # cursor (factors out the write block repeated at three call sites).
        nonlocal write_start_row
        if len(result) > 0:
            feishu_helper.data_to_feishu_sheet(
                sheet_token=res_spreadsheet_token,
                sheet_id=res_sheet_id,
                data=result,
                start_row=write_start_row,
                start_column=write_start_col,
                end_column=write_end_col
            )
            log_.info("write to feishu success!")
            write_start_row += 1

    for i, video in enumerate(videos):
        try:
            log_.info(f"i = {i}, video = {video}")
            # 2. Download the video.
            video_id = video['videoId']
            video_path = video['videoPath']
            if not video_path.endswith('.mp4'):
                # Not a direct .mp4 link: record a row with empty result fields.
                result = [[video_id, video_path, video['title'], '', '', '', '', '']]
                log_.info(f"result = {result}")
                write_result_row(result)
            else:
                try:
                    video_file = download_video(video_path=video_path, video_id=video_id, download_folder='videos')
                    log_.info(f"video_path = {video_file}")
                    # 3. Extract the audio track from the video.
                    audio_path = get_wav(video_path=video_file)
                    log_.info(f"audio_path = {audio_path}")
                    # 4. Run ASR on the audio.
                    dialogue_path, asr_res_initial = call_asr(audio_path=audio_path)
                    log_.info(f"asr_res_initial = {asr_res_initial}")
                except Exception:
                    # Download, audio extraction, or ASR failed: record a row
                    # with empty result fields and move on to the next video.
                    log_.error(traceback.format_exc())
                    result = [[video_id, video_path, video['title'], '', '', '', '', '']]
                    log_.info(f"result = {result}")
                    write_result_row(result)
                    continue

                # 5. Decide whether the ASR transcript is valid/usable.
                validity = asr_validity_discrimination(text=asr_res_initial)
                log_.info(f"validity = {validity}")
                if validity is True:
                    # 6. Clean the ASR transcript.
                    asr_res = asr_res_initial.strip().replace('\n', '')
                    for stop_word in config_.STOP_WORDS:
                        asr_res = asr_res.replace(stop_word, '')
                    # Token limit: keep only the last 2500 characters.
                    asr_res = asr_res[-2500:]
                    # 7. Tag the transcript with GPT.
                    prompt = f"{config_.GPT_PROMPT['tags']['prompt5']}{asr_res.strip()}"
                    # gpt_res = get_tag(prompt=prompt)
                    gpt_res = request_gpt(prompt=prompt)
                    log_.info(f"gpt_res = {gpt_res}, type = {type(gpt_res)}")
                    # 8. Assemble the result row for the Feishu sheet.
                    if gpt_res is None:
                        result = [[video_id, video_path, video['title'], str(validity), asr_res_initial, prompt, '', '']]
                    else:
                        # Keep only the categories tagged with confidence > 0.5.
                        confidence_up_list = []
                        try:
                            for item in json.loads(gpt_res):
                                if item['confidence'] > 0.5:
                                    confidence_up_list.append(item['category'])
                        except Exception:
                            pass
                        confidence_up = ', '.join(confidence_up_list)
                        result = [[video_id, video_path, video['title'], str(validity), asr_res_initial,
                                   prompt, gpt_res, confidence_up]]
                else:
                    # Transcript judged invalid: record the validity flag only.
                    result = [[video_id, video_path, video['title'], str(validity), asr_res_initial, '', '', '']]
                log_.info(f"result = {result}")
                write_result_row(result)
        except Exception as e:
            log_.error(e)
            log_.error(traceback.format_exc())
            continue


if __name__ == '__main__':
    sheet_info = {
        '历史视频top5000回流倒叙排列': {  # top 5000 historical videos by returning users, descending
            'video_spreadsheet_token': 'L4ywsRaV2hFLv1t4Athcdw71nde',
            'video_sheet_id': 'hRjMrL',
            'read_start_row': 1456,
            'res_spreadsheet_token': 'DkiUsqwJ6hmBxstBYyEcNE4ante',
            'res_sheet_id': '7Fua00',
            'write_start_row': 1455,
            'write_start_col': 'A',
            'write_end_col': 'H'
        }
    }
    for sheet_tag, sheet_item in sheet_info.items():
        print(sheet_tag)
        main(sheet_info_config=sheet_item)
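
    # Scratch code for manually testing individual pipeline steps: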
    # video_path = download_video(
    #     video_url='http://rescdn.yishihui.com/longvideo/video/vpc/20230420/22421791F3yZJNHSelDuvs04zd',
    #     video_id='001', download_folder='videos', ftype='mp4')
    # print(video_path)
    # # Extract the audio track from the video.
    # audio_path = get_wav(video_path=video_path)
    # print(audio_path)
    # log_.info(f"audio_path = {audio_path}")
    # # Run ASR on the audio.
    # asr_res = call_asr(audio_path=audio_path)
    # print(asr_res)
    # log_.info(f"asr_res = {asr_res}")
    # # Tag the transcript with GPT.
    # gpt_res = get_tag(text=asr_res)
    # print(gpt_res)