2 ماه پیش · 05c73f5711
--- a/api/search.py
+++ b/api/search.py
@@ -10,6 +10,7 @@ from schemas.schemas import Query, ContentData
 
				 from tools_v1 import query_keyword_summary_results, query_keyword_content_results
			
 
				 from utils.data_utils import add_data
			
 
				 from utils.deepseek_utils import get_keywords
			
 
				+from utils.embedding_utils import get_embedding_content_data
			
 
				 
			
 
				 router = APIRouter()
			
 
				 
			
@@ -19,11 +20,13 @@ executor = ThreadPoolExecutor(max_workers=10)
 
				 
			
 
				 @router.post("/query", response_model=ResponseWrapper)
			
 
				 async def query_keyword(query: Query):
			
 
				+    print(query.text)
			
 
				     keywords = get_keywords(query.text)['keywords']
			
 
				     print(keywords)
			
 
				     summary_res = query_keyword_summary_results(keywords)
			
 
				     content_res = query_keyword_content_results(keywords)
			
 
				-    res = {'summary_results': summary_res, 'content_results': content_res}
			
 
				+    embedding_res = get_embedding_content_data(query.text)
			
 
				+    res = {'summary_results': summary_res, 'content_results': content_res, 'embedding_results': embedding_res}
			
 
				     return ResponseWrapper(
			
 
				         status_code=200,
			
 
				         detail="success",
			
--- a/tools_v1.py
+++ b/tools_v1.py
@@ -117,12 +117,25 @@ def query_keyword_content_results(keywords):
 
				         # 构建关键字ID到关键词的映射
			
 
				         keyword_id_to_word = {data.id: data.keyword for data in keyword_dict.values()}
			
 
				 
			
 
				-        # 构建结果
			
 
				+        # 按关键词分组内容数据
			
 
				+        keyword_to_content = {}
			
 
				+
			
 
				         for relation in keyword_content_relations:
			
 
				             if relation.content_chunk_id in content_map:
			
 
				                 content_data = content_map[relation.content_chunk_id]
			
 
				+                keyword = keyword_id_to_word.get(relation.keyword_id, '未知关键词')
			
 
				+                if keyword not in keyword_to_content:
			
 
				+                    keyword_to_content[keyword] = []
			
 
				+                keyword_to_content[keyword].append(content_data)
			
 
				+
			
 
				+        # 对每个关键词的内容按ID倒排并取前5条
			
 
				+        for keyword, content_list in keyword_to_content.items():
			
 
				+            # 根据 ID 倒排，取前 5 条记录
			
 
				+            sorted_content_list = sorted(content_list, key=lambda x: x.id, reverse=True)[:5]
			
 
				+
			
 
				+            for content_data in sorted_content_list:
			
 
				                 res.append({
			
 
				-                    'keyword': keyword_id_to_word.get(relation.keyword_id, '未知关键词'),
			
 
				+                    'keyword': keyword,
			
 
				                     'content': content_data.text,
			
 
				                     'content_summary': content_data.summary
			
 
				                 })
			
--- a/utils/deepseek_utils.py
+++ b/utils/deepseek_utils.py
@@ -99,7 +99,7 @@ def text_question(text_to_question: str):
 
				 def create_keyword_summary_prompt(text, keyword):
			
 
				     prompt = f"""
			
 
				     
			
 
				-    请基于以下关于关键词"{keyword}"的多条知识，生成一段全面、准确且连贯的知识。
			
 
				+    请基于以下关于关键词"{keyword}"的多条知识，生成一段全面、准确且连贯的知识，不要输出与知识无关的内容，只返回关键词知识内容。
			
 
				 
			
 
				 ## 描述内容：
			
 
				 {text}
			
@@ -128,7 +128,7 @@ def get_keyword_summary(text, keyword):
 
				 def update_keyword_summary_prompt(text, keyword, new_content):
			
 
				     prompt = f"""
			
 
				 
			
 
				-    请基于以下关于关键词"{keyword}"的相关知识，融合最新的知识到现有的知识中。
			
 
				+    请基于以下关于关键词"{keyword}"的相关知识，融合最新的知识到现有的知识中，不要输出与知识无关的内容，只返回关键词知识内容。
			
 
				 
			
 
				 ## 知识要求：
			
 
				 1. 识别重叠与重复：找出不同文本中表述不同但含义相同的内容。
			
@@ -169,7 +169,7 @@ def create_keyword_prompt(text):
 
				         str: 格式化后的 prompt
			
 
				     """
			
 
				     prompt = f"""
			
 
				-提取最能代表当前分析范围（整体或段落）核心内容的关键词或短语。避免使用过于通用和宽泛的词汇
			
 
				+提取最能代表当前分析范围（整体或段落）核心内容的关键词或短语，如果本身就是一个词，直接返回这个词。避免使用过于通用和宽泛的词汇,
			
 
				 ## 描述内容：
			
 
				 {text}
			
 
				 
			
@@ -225,4 +225,4 @@ def chat_with_deepseek(prompt, model="deepseek-chat", max_tokens=8192, temperatu
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				-    print(chat_with_deepseek('你好啊'))
			
 
				+    print(get_keyword_summary('这样去发布你的视频，才能增加播放量！ #新人如何做抖音 #短视频创业 #自媒体创业 #抖音创业','播放量'))
			
--- a/utils/embedding_utils.py
+++ b/utils/embedding_utils.py
@@ -0,0 +1,39 @@
 
				+import json
			
 
				+
			
 
				+import requests
			
 
				+from core.config import logger
			
 
				+from core.database import DBHelper
			
 
				+from data_models.content_chunks import ContentChunks
			
 
				+
			
 
				+
			
 
				+def get_embedding_data(query, limit=5, timeout=10):
			
 
				+    """Query the vector-search service and return its ``results`` list.
			
 
				+
			
 
				+    Args:
			
 
				+        query: Text sent as the ``query`` field of the request body.
			
 
				+        limit: Value sent as the ``limit`` field (defaults to 5).
			
 
				+        timeout: Seconds passed to ``requests.post`` as its timeout.
			
 
				+
			
 
				+    Returns:
			
 
				+        The ``results`` entry of the JSON response, or ``[]`` if the
			
 
				+        request, the status check or the decoding raises.
			
 
				+    """
			
 
				+    try:
			
 
				+        response = requests.post(
			
 
				+            url='http://192.168.100.31:8001/api/search',
			
 
				+            json={
			
 
				+                "query": query,
			
 
				+                "search_type": "by_vector",
			
 
				+                "limit": limit},
			
 
				+            headers={"Content-Type": "application/json"},
			
 
				+            # Without a timeout an unreachable host would block the caller indefinitely.
			
 
				+            timeout=timeout,
			
 
				+        )
			
 
				+        # Surface HTTP error statuses instead of trying to parse their body.
			
 
				+        response.raise_for_status()
			
 
				+        return response.json()['results']
			
 
				+    except Exception as e:
			
 
				+        # Best-effort lookup: log the failure and fall through to [].
			
 
				+        logger.error(e)
			
 
				+    return []
			
 
				+
			
 
				+
			
 
				+def get_embedding_content_data(query):
			
 
				+    """Return stored chunk text/summary for each vector-search hit of *query*.
			
 
				+
			
 
				+    Each hit from ``get_embedding_data`` is looked up as a ``ContentChunks``
			
 
				+    row by its ``doc_id`` and ``chunk_id``.
			
 
				+
			
 
				+    Returns:
			
 
				+        A list of dicts with keys ``content``, ``content_summary`` and
			
 
				+        ``score``; empty when the search yields nothing.
			
 
				+    """
			
 
				+    results = get_embedding_data(query)
			
 
				+    if not results:
			
 
				+        return []
			
 
				+    res = []
			
 
				+    db_helper = DBHelper()
			
 
				+    for result in results:
			
 
				+        content_chunk = db_helper.get(ContentChunks, doc_id=result['doc_id'], chunk_id=result['chunk_id'])
			
 
				+        # NOTE(review): assumes DBHelper.get returns None for a missing row - confirm.
			
 
				+        # Skip such hits so one stale index entry does not fail the whole query.
			
 
				+        if content_chunk is None:
			
 
				+            continue
			
 
				+        res.append(
			
 
				+            {'content': content_chunk.text, 'content_summary': content_chunk.summary, 'score': result['score']})
			
 
				+    return res
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    # Manual check: run one sample query and print the hits as JSON.
			
 
				+    results = get_embedding_content_data("AI绘图工具")
			
 
				+    print(json.dumps(results, ensure_ascii=False))
			
--- a/utils/keywords_utils.py
+++ b/utils/keywords_utils.py
@@ -10,11 +10,12 @@ from data_models.content_chunks import ContentChunks
 
				 from data_models.keyword_clustering import KeywordClustering
			
 
				 from data_models.keyword_data import KeywordData
			
 
				 from data_models.keyword_with_content_chunk import KeywordWithContentChunk
			
 
				-from utils.deepseek_utils import get_keyword_summary, update_keyword_summary_prompt
			
 
				+from utils.deepseek_utils import get_keyword_summary, update_keyword_summary
			
 
				 
			
 
				 
			
 
				 class KeywordSummaryTask:
			
 
				     lock_dict = {}  # 静态变量，不会随着每次实例化而重置
			
 
				+
			
 
				     def __init__(self):
			
 
				         self.executor = ThreadPoolExecutor(max_workers=20, thread_name_prefix='KeywordSummaryTask')
			
 
				 
			
@@ -56,9 +57,9 @@ class KeywordSummaryTask:
 
				                                                                            'keyword_summary'])
			
 
				                             db_helper.add(new_keyword_clustering)
			
 
				                         else:
			
 
				-                            new_keyword_summary = update_keyword_summary_prompt(keyword_clustering.keyword_summary,
			
 
				-                                                                                keyword,
			
 
				-                                                                                content_chunk.text)
			
 
				+                            new_keyword_summary = update_keyword_summary(keyword_clustering.keyword_summary,
			
 
				+                                                                         keyword,
			
 
				+                                                                         content_chunk.text)
			
 
				                             db_helper.update(KeywordClustering, filters={"id": keyword_clustering.id},
			
 
				                                              updates={"keyword_summary": new_keyword_summary})
			
 
				                         db_helper.update(KeywordWithContentChunk, filters={"id": keyword_with_content_chunk.id},