| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157 |
- import json
- from core.database import DBHelper
- from data_models.content_chunks import ContentChunks
- from data_models.keyword_clustering import KeywordClustering
- from data_models.keyword_data import KeywordData
- from data_models.keyword_with_content_chunk import KeywordWithContentChunk
def query_keyword_data(keywords, db_helper):
    """Fetch KeywordData rows for *keywords* in a single query.

    Args:
        keywords: list of keyword strings; falsy input short-circuits.
        db_helper: database helper exposing ``get_all``.

    Returns:
        dict mapping each found keyword string to its KeywordData record
        (empty dict when *keywords* is empty), so callers avoid repeated
        per-keyword lookups.
    """
    if not keywords:
        return {}
    # One batched query instead of N single-keyword queries.
    records = db_helper.get_all(KeywordData, keyword__in=keywords)
    mapping = {}
    for record in records:
        mapping[record.keyword] = record
    return mapping
def query_keyword_summary_results(keywords):
    """Look up the clustering summary associated with each keyword.

    Args:
        keywords: list of keyword strings, e.g. ["kw1", "kw2"].

    Returns:
        list of ``{'keyword': ..., 'keyword_summary': ...}`` dicts in the
        order of *keywords*; keywords without data or without a clustering
        row are skipped. Returns ``[]`` for empty input or on any query
        error (the error is printed, matching the module's convention).
    """
    if not keywords:
        return []
    results = []
    helper = DBHelper()
    try:
        # Batch-fetch all keyword records, then their ids.
        known = query_keyword_data(keywords, helper)
        ids = [record.id for record in known.values()]
        if not ids:
            return results
        # One query for every clustering row, keyed by keyword id.
        summaries_by_id = {
            row.keyword_id: row
            for row in helper.get_all(KeywordClustering, keyword_id__in=ids)
        }
        for word in keywords:
            record = known.get(word)
            if record is None:
                continue
            cluster = summaries_by_id.get(record.id)
            if cluster is None:
                continue
            results.append({
                'keyword': word,
                'keyword_summary': cluster.keyword_summary
            })
    except Exception as e:
        # Best-effort: swallow and report, returning whatever was built.
        print(f"查询关键词总结时出错: {str(e)}")
    return results
def query_keyword_content_results(keywords):
    """Look up content chunks (text + summary) related to each keyword.

    Args:
        keywords: list of keyword strings, e.g. ["kw1", "kw2"].

    Returns:
        list of ``{'keyword': ..., 'content': ..., 'content_summary': ...}``
        dicts, capped at the 5 highest-id (newest) chunks per keyword.
        Returns ``[]`` for empty input or on any query error (the error is
        printed, matching the module's convention).
    """
    if not keywords:
        return []
    results = []
    helper = DBHelper()
    try:
        # Batch-fetch keyword records and collect their ids.
        known = query_keyword_data(keywords, helper)
        ids = [record.id for record in known.values()]
        if not ids:
            return results
        # One query for every keyword -> content-chunk relation.
        relations = helper.get_all(KeywordWithContentChunk, keyword_id__in=ids)
        chunk_ids = [rel.content_chunk_id for rel in relations]
        if not chunk_ids:
            return results
        # One query for all referenced chunks, keyed by chunk id.
        chunks_by_id = {
            chunk.id: chunk
            for chunk in helper.get_all(ContentChunks, id__in=chunk_ids)
        }
        word_by_id = {record.id: record.keyword for record in known.values()}
        # Group the resolved chunks under their keyword string.
        grouped = {}
        for rel in relations:
            chunk = chunks_by_id.get(rel.content_chunk_id)
            if chunk is None:
                continue
            word = word_by_id.get(rel.keyword_id, '未知关键词')
            grouped.setdefault(word, []).append(chunk)
        for word, chunks in grouped.items():
            # Newest (highest id) first, at most 5 chunks per keyword.
            for chunk in sorted(chunks, key=lambda c: c.id, reverse=True)[:5]:
                results.append({
                    'keyword': word,
                    'content': chunk.text,
                    'content_summary': chunk.summary
                })
    except Exception as e:
        # Best-effort: swallow and report, returning whatever was built.
        print(f"查询关键词内容时出错: {str(e)}")
    return results
if __name__ == '__main__':
    # Ad-hoc smoke test: dump content results for two sample keywords.
    sample = query_keyword_content_results(['医疗AI', 'Lora模型'])
    print(json.dumps(sample, ensure_ascii=False))
- #
- # def query_embedding_results(query, top_k=5, better_than_threshold=0.65):
- # graphvectorizer = GraphVectorizer()
- # return graphvectorizer.embedding_search_entity(query, top_k=top_k, better_than_threshold=better_than_threshold)
|