@@ -0,0 +1,151 @@
+import json
+import traceback
+
+from feishu import FeiShuHelper
+from audio_process import get_wav
+from xunfei_asr import call_asr
+from utils import download_video, asr_validity_discrimination
+from gpt_tag import get_tag, request_gpt
+from config import set_config
+from log import Log
+
+config_ = set_config()
+log_ = Log()
+
+
+def main(sheet_info_config):
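+    """Read video rows from a Feishu sheet, tag them with GPT, and write the results back.
+
+    sheet_info_config holds the source/destination spreadsheet tokens, sheet IDs,
+    and the read/write row and column ranges.
+    """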
+    video_spreadsheet_token = sheet_info_config['video_spreadsheet_token']
+    video_sheet_id = sheet_info_config['video_sheet_id']
+    read_start_row = sheet_info_config['read_start_row']
+    res_spreadsheet_token = sheet_info_config['res_spreadsheet_token']
+    res_sheet_id = sheet_info_config['res_sheet_id']
+    write_start_row = sheet_info_config['write_start_row']
+    write_start_col = sheet_info_config['write_start_col']
+    write_end_col = sheet_info_config['write_end_col']
+
+    # 1. Read the Feishu sheet and collect each video's ID, title, and ASR transcript
+    feishu_helper = FeiShuHelper()
+    data = feishu_helper.get_data(spreadsheet_token=video_spreadsheet_token, sheet_id=video_sheet_id)
+    videos = []
+    for item in data[read_start_row:]:
+        try:
+            videos.append(
+                {
+                    'videoId': item[0],
+                    'title': item[3],
+                    'asrRes': item[2],
+                }
+            )
+        except (IndexError, TypeError):
+            # skip rows that don't have the expected columns
+            continue
+    log_.info(f"videos count: {len(videos)}")
+
+    result = []
+    for i, video in enumerate(videos):
+        try:
+            log_.info(f"i = {i}, video = {video}")
+            asr_res_initial = video['asrRes']
+            title = video['title']
+            # 2. Check whether the ASR transcript is usable
+            validity = asr_validity_discrimination(text=asr_res_initial)
+            log_.info(f"validity = {validity}")
+            if validity is True:
+                # 3. Clean the ASR result: strip newlines and configured stop words
+                asr_res = asr_res_initial.strip().replace('\n', '')
+                for stop_word in config_.STOP_WORDS:
+                    asr_res = asr_res.replace(stop_word, '')
+                # Token limit: keep only the last 2500 characters
+                asr_res = asr_res[-2500:]
+
+                # 4. GPT results
+                # 4.1 GPT produces a summary and keywords from the transcript
+                prompt1 = f"{config_.GPT_PROMPT['tags']['prompt6']}{asr_res.strip()}"
+                gpt_res1 = request_gpt(prompt=prompt1)
+                log_.info(f"gpt_res1 = {gpt_res1}, type = {type(gpt_res1)}")
+                if gpt_res1 is None:
+                    # no GPT response: pad the row to the full 8 output columns (E..L)
+                    result = [[str(validity), prompt1, '', '', '', '', '', '']]
+                else:
+                    result = [[str(validity), prompt1, gpt_res1]]
+                    # 4.2 Classify using the summary, keywords, and title
+                    try:
+                        gpt_res1_json = json.loads(gpt_res1)
+                        summary = gpt_res1_json['summary']
+                        keywords = gpt_res1_json['keywords']
+                        result[0].extend([summary, str(keywords)])
+                        prompt2_param = f"标题:{title}\n概况:{summary}\n关键词:{keywords}"
+                        prompt2 = f"{config_.GPT_PROMPT['tags']['prompt7']}{prompt2_param}"
+                        log_.info(f"prompt2: {prompt2}")
+                        gpt_res2 = request_gpt(prompt=prompt2)
+                        log_.info(f"gpt_res2 = {gpt_res2}, type = {type(gpt_res2)}")
+
+                        # 5. Assemble the row to write back to Feishu
+                        if gpt_res2 is None:
+                            result[0].extend(['', '', ''])
+                        else:
+                            # keep only the categories GPT marked with confidence > 0.5
+                            confidence_up_list = []
+                            try:
+                                for item in json.loads(gpt_res2):
+                                    if item['confidence'] > 0.5:
+                                        confidence_up_list.append(item['category'])
+                            except (json.JSONDecodeError, KeyError, TypeError):
+                                pass
+                            confidence_up = ', '.join(confidence_up_list)
+                            result[0].extend([prompt2, gpt_res2, confidence_up])
+                    except (json.JSONDecodeError, KeyError, TypeError):
+                        # gpt_res1 wasn't the expected JSON: pad the remaining 5 columns
+                        result[0].extend(['', '', '', '', ''])
+            else:
+                # unusable ASR text: record validity only, pad the remaining columns
+                result = [[str(validity), '', '', '', '', '', '', '']]
+            log_.info(f"result = {result}")
+            if len(result) > 0:
+                feishu_helper.update_values(
+                    sheet_token=res_spreadsheet_token,
+                    sheet_id=res_sheet_id,
+                    data=result,
+                    start_row=write_start_row,
+                    start_column=write_start_col,
+                    end_column=write_end_col
+                )
+                log_.info("write to feishu success!")
+                write_start_row += 1
+        except Exception as e:
+            log_.error(e)
+            log_.error(traceback.format_exc())
+            continue
+
+
+if __name__ == '__main__':
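+    # Columns E..L receive the 8 result fields per row:
+    # validity, prompt1, gpt_res1, summary, keywords, prompt2, gpt_res2, confidence_up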
+    sheet_info = {
+        'top100新promt-0605': {
+            'video_spreadsheet_token': 'DkiUsqwJ6hmBxstBYyEcNE4ante',
+            'video_sheet_id': 'tbd971',
+            'read_start_row': 1,
+            'res_spreadsheet_token': 'DkiUsqwJ6hmBxstBYyEcNE4ante',
+            'res_sheet_id': 'tbd971',
+            'write_start_row': 2,
+            'write_start_col': 'E',
+            'write_end_col': 'L'
+        }
+    }
+
+    for sheet_tag, sheet_item in sheet_info.items():
+        print(sheet_tag)
+        main(sheet_info_config=sheet_item)
+
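+    # The commented block below sketches the full standalone pipeline
+    # (download -> extract audio -> ASR -> GPT) for videos that don't
+    # already have an ASR result in the sheet: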
+    # video_path = download_video(
+    #     video_url='http://rescdn.yishihui.com/longvideo/video/vpc/20230420/22421791F3yZJNHSelDuvs04zd',
+    #     video_id='001', download_folder='videos', ftype='mp4')
+    # print(video_path)
+    # # 3. Extract the audio track from the video
+    # audio_path = get_wav(video_path=video_path)
+    # print(audio_path)
+    # log_.info(f"audio_path = {audio_path}")
+    # # 4. ASR
+    # asr_res = call_asr(audio_path=audio_path)
+    # print(asr_res)
+    # log_.info(f"asr_res = {asr_res}")
+    # # 5. GPT results
+    # gpt_res = get_tag(text=asr_res)
+    # print(gpt_res)