@@ -0,0 +1,150 @@
+import concurrent.futures
+import json
+import threading
+
+from core.database import DBHelper
+from data_models.content_data import ContentData
+from data_models.keyword_clustering import KeywordClustering
+from data_models.keyword_data import KeywordData
+from data_models.keyword_with_content import KeywordWithContent
+from utils.deepseek_utils import text_segment, text_question, create_keyword_summary_prompt, get_keyword_summary, \
+    update_keyword_summary_prompt
+
+
+def is_empty(value):
+    """Helper: return True when a value is empty (None or the empty string)."""
+    return value is None or value == ""
+
+
+def parse_json(file_path):
+    """Read a UTF-8 JSON file and return its top-level array, or [] on any error."""
+    try:
+        # Read the file contents
+        with open(file_path, 'r', encoding='utf-8') as file:
+            try:
+                # Parse the JSON content
+                json_data = json.load(file)
+                # The file is expected to contain a JSON array
+                if isinstance(json_data, list):
+                    return json_data
+                else:
+                    print("Error: file content is not a JSON array")
+            except json.JSONDecodeError as e:
+                print(f"JSON parse error: {e}")
+    except FileNotFoundError:
+        print(f"Error: file '{file_path}' not found")
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+    return []
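+
+# A minimal sketch of the input file this script expects. The shape is an
+# assumption inferred from the item['body_text'] lookups in
+# process_texts_concurrently below; any fields beyond 'body_text' are hypothetical.
+# [
+#     {"body_text": "full text of the first document ..."},
+#     {"body_text": "full text of the second document ..."}
+# ]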
+
+
+def generate_keywords(keyword, content_data):
+    """Upsert a keyword, link it to the content row, and create or refresh its cluster summary."""
+    db_helper = DBHelper()
+    keyword_data = db_helper.get(KeywordData, keyword=keyword)
+    if keyword_data is None:
+        new_keyword_data = KeywordData(keyword=keyword)
+        keyword_data = db_helper.add(new_keyword_data)
+    keyword_with_content = KeywordWithContent(keyword_id=keyword_data.id, content_id=content_data.id)
+    db_helper.add(keyword_with_content)
+    keyword_clustering = db_helper.get(KeywordClustering, keyword_id=keyword_data.id)
+    if keyword_clustering is None:
+        # First occurrence of this keyword: build a fresh summary from the content
+        keyword_summary = get_keyword_summary(content_data.content, keyword_data.keyword)
+        new_keyword_clustering = KeywordClustering(keyword_id=keyword_data.id,
+                                                   keyword_summary=keyword_summary['keyword_summary'])
+        db_helper.add(new_keyword_clustering)
+        print(new_keyword_clustering)
+    else:
+        # Keyword already clustered: merge the new content into the existing summary
+        new_keyword_summary = update_keyword_summary_prompt(keyword_clustering.keyword_summary, keyword,
+                                                            content_data.content)
+        db_helper.update(KeywordClustering, filters={"id": keyword_clustering.id},
+                         updates={"keyword_summary": new_keyword_summary})
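+
+# Assumed contracts for the deepseek_utils helpers used above, inferred only
+# from how this file subscripts their return values (not from the library itself):
+#   get_keyword_summary(content, keyword)
+#       -> {"keyword_summary": "<summary text>"}
+#   update_keyword_summary_prompt(old_summary, keyword, content)
+#       -> "<merged summary text>"  (stored directly as the new keyword_summary)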
+
+
+# Segment the text and persist each segment as a 7-field record
+def ai_dispose(text):
+    db_helper = DBHelper()
+    segments = text_segment(text)['segments']
+    segment_pre_content_id = None
+    for segment in segments:
+        content = segment['content']
+        summary = segment['summary']
+        keywords = segment['keywords']
+        seg_text = ''
+        if not is_empty(content) and not is_empty(summary):
+            # Both present: concatenate (joined with '。'; change the separator as needed)
+            seg_text = f"{content}。{summary}"
+        elif not is_empty(summary):
+            # Only the summary is present: use the summary
+            seg_text = summary
+        elif not is_empty(content):
+            # Only the content is present: use the content
+            seg_text = content
+        questions = text_question(seg_text)['questions']
+        content_data = ContentData(pre_content_id=segment_pre_content_id,
+                                   content=content,
+                                   summary=summary,
+                                   keywords=json.dumps(keywords, ensure_ascii=False),
+                                   entities=json.dumps(segment['entities'], ensure_ascii=False),
+                                   questions=json.dumps(questions, ensure_ascii=False),
+                                   keywords_status=0)
+        db_helper.add(content_data)
+        segment_pre_content_id = content_data.id
+        for keyword in keywords:
+            generate_keywords(keyword, content_data)
+        # Mark the row once all of its keywords have been processed
+        db_helper.update(ContentData, filters={"id": content_data.id}, updates={"keywords_status": 1})
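+
+# Assumed response shapes for the segmentation helpers, again inferred from the
+# key accesses in ai_dispose rather than from deepseek_utils itself:
+#   text_segment(text)  -> {"segments": [{"content": "...", "summary": "...",
+#                                         "keywords": [...], "entities": [...]}]}
+#   text_question(text) -> {"questions": ["...", ...]}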
+
+
+# Shared lock so that prints from worker threads do not interleave
+print_lock = threading.Lock()
+
+
+def process_text(text):
+    """Process a single text (thread-pool worker)."""
+    try:
+        # Hold the lock so concurrent prints do not interleave
+        with print_lock:
+            print(f"Processing text: {text[:50]}...")  # only the first 50 chars, to keep logs short
+
+        # Delegate to the original processing pipeline
+        result = ai_dispose(text)  # ai_dispose currently returns None
+
+        return result
+    except Exception as e:
+        with print_lock:
+            print(f"Error while processing text: {e}")
+        return None
+
+
+# Process a list of texts with a thread pool
+def process_texts_concurrently(text_list, max_workers=20):
+    """Process the text list concurrently using a thread pool."""
+    results = []
+
+    # Create the thread-pool executor
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Submit one task per item; map each future back to its body text
+        future_to_text = {executor.submit(process_text, item['body_text']): item['body_text']
+                          for item in text_list}
+
+        # Collect tasks as they complete
+        for future in concurrent.futures.as_completed(future_to_text):
+            text = future_to_text[future]
+            try:
+                result = future.result()
+                results.append(result)
+                with print_lock:
+                    print(f"Successfully processed text: {text[:30]}...")
+            except Exception as e:
+                with print_lock:
+                    print(f"Exception while processing text: {e}")
+
+    return results
+
+
+if __name__ == '__main__':
+    json_path = '../data/test_data1.json'
+    text_list = parse_json(json_path)
+    process_texts_concurrently(text_list)
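+
+# Usage sketch: lower max_workers if the downstream model API rate-limits;
+# the value 4 here is an arbitrary illustration, not a recommendation.
+# process_texts_concurrently(text_list, max_workers=4)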