| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157 |
- import json
- from core.database import DBHelper
- from data_models.content_chunks import ContentChunks
- from data_models.keyword_clustering import KeywordClustering
- from data_models.keyword_data import KeywordData
- from data_models.keyword_with_content_chunk import KeywordWithContentChunk
def query_keyword_data(keywords, db_helper):
    """Fetch KeywordData rows for *keywords* in a single query.

    Args:
        keywords: list of keyword strings; falsy input short-circuits.
        db_helper: database helper exposing ``get_all``.

    Returns:
        dict mapping each found keyword string to its KeywordData record
        (empty dict when *keywords* is empty), so callers avoid repeated
        per-keyword lookups.
    """
    if not keywords:
        return {}
    # One batched query instead of N single-keyword queries.
    records = db_helper.get_all(KeywordData, keyword__in=keywords)
    mapping = {}
    for record in records:
        mapping[record.keyword] = record
    return mapping
def query_keyword_summary_results(keywords):
    """Look up the clustering summary associated with each keyword.

    Args:
        keywords: list of keyword strings, e.g. ["kw1", "kw2"].

    Returns:
        list of ``{'keyword': ..., 'keyword_summary': ...}`` dicts in the
        order of *keywords*; keywords without data or without a clustering
        row are skipped. Returns ``[]`` for empty input or on any query
        error (the error is printed, matching the module's convention).
    """
    if not keywords:
        return []
    results = []
    helper = DBHelper()
    try:
        # Batch-fetch all keyword records, then their ids.
        known = query_keyword_data(keywords, helper)
        ids = [record.id for record in known.values()]
        if not ids:
            return results
        # One query for every clustering row, keyed by keyword id.
        summaries_by_id = {
            row.keyword_id: row
            for row in helper.get_all(KeywordClustering, keyword_id__in=ids)
        }
        for word in keywords:
            record = known.get(word)
            if record is None:
                continue
            cluster = summaries_by_id.get(record.id)
            if cluster is None:
                continue
            results.append({
                'keyword': word,
                'keyword_summary': cluster.keyword_summary
            })
    except Exception as e:
        # Best-effort: swallow and report, returning whatever was built.
        print(f"查询关键词总结时出错: {str(e)}")
    return results
def query_keyword_content_results(keywords):
    """Look up content chunks (text + summary) related to each keyword.

    Args:
        keywords: list of keyword strings, e.g. ["kw1", "kw2"].

    Returns:
        list of ``{'keyword': ..., 'content': ..., 'content_summary': ...}``
        dicts, capped at the 5 highest-id (newest) chunks per keyword.
        Returns ``[]`` for empty input or on any query error (the error is
        printed, matching the module's convention).
    """
    if not keywords:
        return []
    results = []
    helper = DBHelper()
    try:
        # Batch-fetch keyword records and collect their ids.
        known = query_keyword_data(keywords, helper)
        ids = [record.id for record in known.values()]
        if not ids:
            return results
        # One query for every keyword -> content-chunk relation.
        relations = helper.get_all(KeywordWithContentChunk, keyword_id__in=ids)
        chunk_ids = [rel.content_chunk_id for rel in relations]
        if not chunk_ids:
            return results
        # One query for all referenced chunks, keyed by chunk id.
        chunks_by_id = {
            chunk.id: chunk
            for chunk in helper.get_all(ContentChunks, id__in=chunk_ids)
        }
        word_by_id = {record.id: record.keyword for record in known.values()}
        # Group the resolved chunks under their keyword string.
        grouped = {}
        for rel in relations:
            chunk = chunks_by_id.get(rel.content_chunk_id)
            if chunk is None:
                continue
            word = word_by_id.get(rel.keyword_id, '未知关键词')
            grouped.setdefault(word, []).append(chunk)
        for word, chunks in grouped.items():
            # Newest (highest id) first, at most 5 chunks per keyword.
            for chunk in sorted(chunks, key=lambda c: c.id, reverse=True)[:5]:
                results.append({
                    'keyword': word,
                    'content': chunk.text,
                    'content_summary': chunk.summary
                })
    except Exception as e:
        # Best-effort: swallow and report, returning whatever was built.
        print(f"查询关键词内容时出错: {str(e)}")
    return results
if __name__ == '__main__':
    # Ad-hoc smoke test: dump content results for two sample keywords.
    sample = query_keyword_content_results(['医疗AI', 'Lora模型'])
    print(json.dumps(sample, ensure_ascii=False))
- #
- # def query_embedding_results(query, top_k=5, better_than_threshold=0.65):
- # graphvectorizer = GraphVectorizer()
- # return graphvectorizer.embedding_search_entity(query, top_k=top_k, better_than_threshold=better_than_threshold)
|