liqian, 1 year ago
Parent
Current commit
0659fd74d8
2 files changed, 163 insertions(+) and 1 deletion(-)
  1. config.py (+12 −1)
  2. temporary_process.py (+151 −0)

config.py (+12 −1)

@@ -56,7 +56,7 @@ class BaseConfig(object):
 2. 用20个字以内对文本内容进行概括。
 3. 为文本取一个易于分享、吸引人的标题。
 4. 列举三个关键词。
-以json格式返回,key为category, confidence, summery, title, keywords。分别代表类别,分类置信度,概要,标题,关键词。
+以json格式返回,key为category, confidence, summary, title, keywords。分别代表类别,分类置信度,概要,标题,关键词。
 -----------------------------
 """,
             'prompt3': f"""
@@ -72,6 +72,17 @@ class BaseConfig(object):
 以json array格式返回,{format_json_array},key为category与confidence,分别代表类别与分类置信度。给出top 3的分类结果。
 -----------------------------
     """,
+            'prompt6': f"""
+请对如下文本进行:
+1. 用20个字以内对文本内容进行概括。
+2. 列举三个关键词。
+仅以json格式返回,key为summary, keywords。分别代表概要,关键词。
+-----------------------------
+""",
+            'prompt7': f"""请根据以下的视频信息对其进行分类。类别为其中的一个:【{' '.join(TAGS_NEW)}】。
+仅以json array格式返回,{format_json_array},key为category与confidence,分别代表类别与分类置信度。给出top 3的分类结果。
+-----------------------------
+""",
         },
         'title': {
             'prompt1': f"""
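The two new prompts chain together: prompt6 asks for a JSON object with summary and keywords keys, and prompt7 asks for a JSON array of {category, confidence} objects holding the top-3 categories. Below is a minimal sketch of parsing those two response shapes defensively; it assumes the raw model text comes from request_gpt (imported from gpt_tag in temporary_process.py below), and the sample shapes in the comments are illustrative only:

import json

def parse_prompt6(raw):
    # prompt6 asks for: {"summary": "...", "keywords": [...]}
    try:
        obj = json.loads(raw)
        return obj['summary'], obj['keywords']
    except (json.JSONDecodeError, KeyError, TypeError):
        return None, None

def parse_prompt7(raw, threshold=0.5):
    # prompt7 asks for a JSON array like [{"category": ..., "confidence": ...}, ...];
    # keep only categories above the confidence threshold, as temporary_process.py does
    try:
        return [item['category'] for item in json.loads(raw)
                if item['confidence'] > threshold]
    except (json.JSONDecodeError, KeyError, TypeError):
        return []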

temporary_process.py (+151 −0)

@@ -0,0 +1,151 @@
+import json
+import traceback
+
+from feishu import FeiShuHelper
+from audio_process import get_wav
+from xunfei_asr import call_asr
+from utils import download_video, asr_validity_discrimination
+from gpt_tag import get_tag, request_gpt
+# get_wav, call_asr, download_video and get_tag are only used by the
+# commented-out end-to-end flow at the bottom of this file
+from config import set_config
+from log import Log
+config_ = set_config()
+log_ = Log()
+
+
+def main(sheet_info_config):
+    video_spreadsheet_token = sheet_info_config['video_spreadsheet_token']
+    video_sheet_id = sheet_info_config['video_sheet_id']
+    read_start_row = sheet_info_config['read_start_row']
+    res_spreadsheet_token = sheet_info_config['res_spreadsheet_token']
+    res_sheet_id = sheet_info_config['res_sheet_id']
+    write_start_row = sheet_info_config['write_start_row']
+    write_start_col = sheet_info_config['write_start_col']
+    write_end_col = sheet_info_config['write_end_col']
+
+    # 1. Read the Feishu sheet to get each video's asr_res and title
+    feishu_helper = FeiShuHelper()
+    data = feishu_helper.get_data(spreadsheet_token=video_spreadsheet_token, sheet_id=video_sheet_id)
+    videos = []
+    for item in data[read_start_row:]:
+        try:
+            videos.append(
+                {
+                    'videoId': item[0],
+                    'title': item[3],
+                    'asrRes': item[2],
+                }
+            )
+        except IndexError:
+            # skip rows that are missing required columns
+            continue
+    log_.info(f"videos count: {len(videos)}")
+
+    result = []
+    for i, video in enumerate(videos):
+        try:
+            log_.info(f"i = {i}, video = {video}")
+            asr_res_initial = video['asrRes']
+            title = video['title']
+            # 2. Check whether the ASR transcript is valid
+            validity = asr_validity_discrimination(text=asr_res_initial)
+            log_.info(f"validity = {validity}")
+            if validity is True:
+                # 3. Clean the ASR transcript
+                asr_res = asr_res_initial.strip().replace('\n', '')
+                for stop_word in config_.STOP_WORDS:
+                    asr_res = asr_res.replace(stop_word, '')
+                # token limit: keep only the last 2500 characters
+                asr_res = asr_res[-2500:]
+
+                # 4. Generate results with GPT
+                # 4.1 GPT produces the summary and keywords
+                prompt1 = f"{config_.GPT_PROMPT['tags']['prompt6']}{asr_res.strip()}"
+                gpt_res1 = request_gpt(prompt=prompt1)
+                log_.info(f"gpt_res1 = {gpt_res1}, type = {type(gpt_res1)}")
+                if gpt_res1 is None:
+                    # pad the row to the 8 written columns (E..L)
+                    result = [[str(validity), prompt1, '', '', '', '', '', '']]
+                else:
+                    result = [[str(validity), prompt1, gpt_res1]]
+                    # 4.2 Classify using the summary, keywords and title
+                    try:
+                        gpt_res1_json = json.loads(gpt_res1)
+                        summary = gpt_res1_json['summary']
+                        keywords = gpt_res1_json['keywords']
+                        result[0].extend([summary, str(keywords)])
+                        prompt2_param = f"标题:{title}\n概况:{summary}\n关键词:{keywords}"
+                        prompt2 = f"{config_.GPT_PROMPT['tags']['prompt7']}{prompt2_param}"
+                        log_.info(f"prompt2: {prompt2}")
+                        gpt_res2 = request_gpt(prompt=prompt2)
+                        log_.info(f"gpt_res2 = {gpt_res2}, type = {type(gpt_res2)}")
+
+                        # 5. Write the result row back to the Feishu sheet
+                        if gpt_res2 is None:
+                            result[0].extend(['', '', ''])
+                        else:
+                            confidence_up_list = []
+                            try:
+                                for item in json.loads(gpt_res2):
+                                    if item['confidence'] > 0.5:
+                                        confidence_up_list.append(item['category'])
+                            except (json.JSONDecodeError, KeyError, TypeError):
+                                # gpt_res2 is not valid JSON; leave the category list empty
+                                pass
+                            confidence_up = ', '.join(confidence_up_list)
+                            result[0].extend([prompt2, gpt_res2, confidence_up])
+                    except (json.JSONDecodeError, KeyError):
+                        # gpt_res1 is not valid JSON; pad the remaining 5 columns
+                        result[0].extend(['', '', '', '', ''])
+            else:
+                result = [[str(validity), '', '', '', '', '', '', '']]
+            log_.info(f"result = {result}")
+            if len(result) > 0:
+                feishu_helper.update_values(
+                    sheet_token=res_spreadsheet_token,
+                    sheet_id=res_sheet_id,
+                    data=result,
+                    start_row=write_start_row,
+                    start_column=write_start_col,
+                    end_column=write_end_col
+                )
+                log_.info("write to feishu success!")
+                write_start_row += 1
+        except Exception as e:
+            log_.error(e)
+            log_.error(traceback.format_exc())
+            continue
+
+
+if __name__ == '__main__':
+    sheet_info = {
+        'top100新prompt-0605': {
+            'video_spreadsheet_token': 'DkiUsqwJ6hmBxstBYyEcNE4ante',
+            'video_sheet_id': 'tbd971',
+            'read_start_row': 1,
+            'res_spreadsheet_token': 'DkiUsqwJ6hmBxstBYyEcNE4ante',
+            'res_sheet_id': 'tbd971',
+            'write_start_row': 2,
+            'write_start_col': 'E',
+            'write_end_col': 'L'
+        }
+    }
+
+    for sheet_tag, sheet_item in sheet_info.items():
+        print(sheet_tag)
+        main(sheet_info_config=sheet_item)
+
+    # video_path = download_video(
+    #     video_url='http://rescdn.yishihui.com/longvideo/video/vpc/20230420/22421791F3yZJNHSelDuvs04zd',
+    #     video_id='001', download_folder='videos', ftype='mp4')
+    # print(video_path)
+    # # 3. Extract the audio track from the video
+    # audio_path = get_wav(video_path=video_path)
+    # print(audio_path)
+    # log_.info(f"audio_path = {audio_path}")
+    # # 4. ASR
+    # asr_res = call_asr(audio_path=audio_path)
+    # print(asr_res)
+    # log_.info(f"asr_res = {asr_res}")
+    # # 5. Generate results with GPT
+    # gpt_res = get_tag(text=asr_res)
+    # print(gpt_res)
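For reference, a condensed sketch of the two-stage flow that main() implements, with the Feishu sheet I/O stripped away. It assumes config_, request_gpt and asr_validity_discrimination behave as imported above, and it lets JSON errors propagate for brevity:

def tag_text(asr_res_initial, title):
    # skip transcripts that the validity filter rejects
    if asr_validity_discrimination(text=asr_res_initial) is not True:
        return None
    # clean the transcript and keep only the last 2500 characters
    asr_res = asr_res_initial.strip().replace('\n', '')
    for stop_word in config_.STOP_WORDS:
        asr_res = asr_res.replace(stop_word, '')
    asr_res = asr_res[-2500:]
    # stage 1: prompt6 -> summary and keywords
    res1 = json.loads(request_gpt(prompt=f"{config_.GPT_PROMPT['tags']['prompt6']}{asr_res}"))
    # stage 2: prompt7 -> top-3 categories from title + summary + keywords
    prompt2_param = f"标题:{title}\n概况:{res1['summary']}\n关键词:{res1['keywords']}"
    res2 = json.loads(request_gpt(prompt=f"{config_.GPT_PROMPT['tags']['prompt7']}{prompt2_param}"))
    return res1, res2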