Explorar o código

添加对结构结果 what,pattern,how的存储,索引的支持

lookathis@163.com hai 1 mes
pai
achega
e836747eee

+ 312 - 0
deconstruct_SQI/milvus_deconstruct_insert.py

@@ -0,0 +1,312 @@
+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
+import requests
+import json
+from typing import Dict, Any, List
+from pymongo import MongoClient
+
+from pydub import AudioSegment
+import io
+from scipy.io import wavfile
+
+################################ Milvus connection: begin
+# Connection settings, passed as keyword arguments to connections.connect().
+# NOTE(review): the account password is hard-coded here in plain text;
+# consider loading it from the environment or a secrets store.
+MILVUS_CONFIG = {
+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
+    "user": "root",
+    "password": "Piaoquan@2025",
+    "port": "19530",
+}
+print("正在连接 Milvus 数据库...")
+# Executed at module top level: registers the connection under the alias "default".
+connections.connect("default", **MILVUS_CONFIG)
+print("连接成功!")
+################################ Milvus connection: end
+
+##################################引入多模态模型#################
+import torch
+from PIL import Image
+from transformers.utils.import_utils import is_flash_attn_2_available
+
+from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
+
+# Load the multimodal embedding model in bfloat16 on CUDA and switch it to
+# eval mode; flash-attention 2 is requested only when it is available.
+model = ColQwen2_5Omni.from_pretrained(
+    "vidore/colqwen-omni-v0.1",
+    torch_dtype=torch.bfloat16,
+    device_map="cuda",  # or "mps" if on Apple Silicon
+    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
+).eval()
+# NOTE(review): the processor is loaded from a different repository id
+# ("manu/...") than the model ("vidore/...") - confirm this is intended.
+processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
+################################## multimodal model: end #################
+
+################################ Embedding service: begin
+# Note (from an earlier discussion): the remote service has to be forwarded
+# to the local machine through an SSH tunnel.
+# Run on the local machine: ssh -R 8000:192.168.100.31:8000 username@server_ip
+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
+
+def get_basic_embedding(text: str, model=DEFAULT_MODEL):
+    """通过HTTP调用在线embedding服务"""
+    headers = {
+        "Content-Type": "application/json"
+    }
+    data = {
+        "model": model,
+        "input": text
+    }
+    
+    response = requests.post(
+        VLLM_SERVER_URL,
+        headers=headers,
+        json=data,
+        timeout=5  # 添加超时设置
+    )
+    response.raise_for_status()  # 如果状态码不是200,抛出异常
+    result = response.json()
+    return result["data"][0]["embedding"]
+
+def get_media_embedding(query: str, type: str):
+    '''
+    query 是查询字符串或文件路径
+    type 是查询类型,可选值为 "audio", "image", "video", "text"
+    k 是返回的结果数量,默认值为 3
+    audio image video 的query为路径
+    text的query为问题本身
+    '''
+    if type =="audio":
+        batch_queries = processor.process_audios([query]).to(model.device)
+
+    elif type =="image":
+        query_image = Image.open(query)
+        batch_queries = processor.process_images([query_image]).to(model.device)
+    elif type =="video":
+        batch_queries = processor.process_videos([query]).to(model.device)   
+    elif type =="text":
+        batch_queries = processor.process_queries([query]).to(model.device)
+    # Forward pass
+    with torch.no_grad():
+        query_embeddings = model(**batch_queries)
+    return query_embeddings
+    # # scores = processor.score_multi_vector(query_embeddings, ds)
+    # print("score is ", scores)
+    # # get top-5 scores
+    # return scores[0].topk(k).indices.tolist()
+
+# ################################连接Embedding service B
+
+def parse_deconstruct_res(json_data) -> Dict[str, Dict[str, str]]:
+    """
+    解析 deconstruct_res.json 文件,提取两类信息:
+    1. 所有 "what" 字段的 path 与 value 映射
+    2. 所有类型为 "image" 或 "video" 的媒体引用 path 与 content 值映射
+    
+    返回:
+        {
+            "what": {path: value, ...},
+            "media": {path: value, ...}
+        }
+    """
+    data = json_data
+    what_dict: Dict[str, Any] = {}
+    media_dict: Dict[str, Any] = {}
+
+    def traverse(obj: Any, current_path: str = ""):
+        """递归遍历 JSON 结构,记录目标字段"""
+        if isinstance(obj, dict):
+            for k, v in obj.items():
+                # 构建新路径,避免在开头添加点号
+                new_path = f"{current_path}.{k}" if current_path else k
+                
+                if k == "what":
+                    what_dict[new_path] = v
+                # 处理媒体引用字段
+                elif k == "媒体引用" and isinstance(v, list):
+                    # 遍历媒体引用数组
+                    for idx, media_item in enumerate(v):
+                        if isinstance(media_item, dict) and media_item.get("type") in ("image", "video", "audio"):
+                            # 记录content字段作为媒体路径
+                            content = media_item.get("content")
+                            type_nm = media_item.get("type")
+                            if content:
+                                # 生成正确格式的路径,如"图片元素[5].媒体引用[0].content"
+                                media_ref_path = f"{type_nm}-{new_path}[{idx}].content"
+                                media_dict[media_ref_path] = content
+                
+                # 继续递归遍历
+                traverse(v, new_path)
+        
+        elif isinstance(obj, list):
+            for idx, item in enumerate(obj):
+                # 对于数组元素,使用方括号索引
+                new_path = f"{current_path}[{idx}]"
+                traverse(item, new_path)
+
+    traverse(data)
+    return {"what": what_dict, "media": media_dict}
+
+# 使用示例
+if __name__ == "__main__":
+    
+    # 连接 MongoDB 数据库
+    ##################### 存储到mongoDB
+
+    MONGO_URI = "mongodb://localhost:27017/"
+    DB_NAME = "mydeconstruct"
+    COLL_NAME = "deconstruct"
+
+    client = MongoClient(MONGO_URI)
+    db = client[DB_NAME]
+    coll = db[COLL_NAME]
+
+    # 读取并插入 JSON 文件
+    json_path = "/home/ecs-user/project/colpali/src/deconstruct_res.json"
+
+    with open(json_path, "r", encoding="utf-8") as f:
+        doc = json.load(f)
+
+    insert_result = coll.insert_one(doc)
+    inserted_id = insert_result.inserted_id
+    print("已插入 MongoDB,文档 _id:", inserted_id)
+
+    result = parse_deconstruct_res(doc)
+    print("what 字段映射:", result["what"])
+    print("媒体引用映射:", result["media"])
+
+    ##################### 存储到mongoDB
+
+    ##################### 将 result["what"] 中的每个 value 转换为向量并插入 Milvus
+    ########## 文本向量库存一份what
+    # 创建 Milvus 集合(如不存在)
+    collection_name = "deconstruct_what"
+    if not utility.has_collection(collection_name): 
+        fields = [
+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
+            FieldSchema(name="mongo_id", dtype=DataType.VARCHAR, max_length=64),
+            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512),
+            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=2560)
+        ]
+        schema = CollectionSchema(fields, description="Deconstruct what embeddings")
+        collection = Collection(name=collection_name, schema=schema)
+        # 创建 IVF_FLAT 索引
+        index_params = {
+            "metric_type": "IP",
+            "index_type": "IVF_FLAT",
+            "params": {"nlist": 128}
+        }
+        collection.create_index("embedding", index_params)
+    else:
+        collection = Collection(name=collection_name)
+
+    # 遍历 result["what"],生成 embeddings 并插入 Milvus
+    entities = []
+    for key, value in result["what"].items():
+        embedding = get_basic_embedding(value, model=DEFAULT_MODEL)
+        path = key
+        entities.append({
+            "mongo_id": str(inserted_id),
+            "path": path,
+            "embedding": embedding
+        })
+
+    if entities:
+        collection.insert(entities)
+        collection.flush()
+        print(f"已插入 {len(entities)} 条 what 字段向量到 Milvus")
+    else:
+        print("未找到 what 字段,未插入向量")
+    ##################### 将 result["what"] 中的每个 value 转换为向量并插入 Milvus
+
+    #####################将 result["media"] 中的每个 value 调用多模态编码模型计算embedding并插入Milvus
+    # 创建 Milvus 集合(如不存在)
+    collection_name = "deconstruct_media"
+    if not utility.has_collection(collection_name):
+        fields = [
+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
+            FieldSchema(name="mongo_id", dtype=DataType.VARCHAR, max_length=64),
+            FieldSchema(name="type", dtype=DataType.VARCHAR, max_length=64),
+            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512),
+            FieldSchema(name="no", dtype=DataType.INT32),
+            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=2560)
+        ]
+        schema = CollectionSchema(fields, description="Deconstruct media embeddings")
+        collection = Collection(name=collection_name, schema=schema)
+        # 创建 IVF_FLAT 索引
+        index_params = {
+            "metric_type": "IP",
+            "index_type": "IVF_FLAT",
+            "params": {"nlist": 128}
+        }
+        collection.create_index("embedding", index_params)
+    else:
+        collection = Collection(name=collection_name)
+        # 遍历 result["media"],生成 embeddings 并插入 Milvus
+    #############存储一份media embedding到Milvus
+    entities = []
+    for key, value in result["media"].items():
+        embedding = get_media_embedding(value, model=DEFAULT_MODEL)
+        type = key[:key.index("-")]
+        path = key[key.index("-"):]
+        # 将 embedding 列表拆分为单条向量,并记录其在原列表中的位置 no
+        if isinstance(embedding, list) and len(embedding) > 0:
+            for idx, vec in enumerate(embedding):
+                entities.append({
+                    "mongo_id": str(inserted_id),
+                    "type": type,
+                    "path": path,
+                    "no": idx,
+                    "embedding": vec
+                })
+        else:
+            # 若 embedding 不是列表或长度为 0,则 no 记为 0
+            entities.append({
+                "mongo_id": str(inserted_id),
+                "type": type,
+                "path": path,
+                "no": 0,
+                "embedding": embedding
+            })
+
+    # 将插入操作移到循环外部,避免重复插入和数据累积
+    if entities:
+        collection.insert(entities)
+        collection.flush()
+        print(f"已插入 {len(entities)} 条 media 字段向量到 Milvus")
+    else:
+        print("未找到有效的 media 字段向量,未插入数据")
+
+    #############存储一份what 多模态embedding 到Milvus
+    entities = []
+    for key, value in result["what"].items():
+        embedding = get_media_embedding(value, model=DEFAULT_MODEL)
+        # type = key[:key.index("-")]
+        # path = key[key.index("-"):]
+        path = key
+        if isinstance(embedding, list) and len(embedding) > 0:
+            for idx, vec in enumerate(embedding):
+                entities.append({
+                    "mongo_id": str(inserted_id),
+                    "type": "text",
+                    "path": path,
+                    "no": idx,
+                    "embedding": vec
+                })
+        else:
+            # 若 embedding 不是列表或长度为 0,则 no 记为 0
+            entities.append({
+                "mongo_id": str(inserted_id),
+                "type": "text",
+                "path": path,
+                "no": 0,
+                "embedding": embedding
+            })
+
+    # 将插入操作移到循环外部,避免重复插入和数据累积
+    if entities:
+        collection.insert(entities)
+        collection.flush()
+        print(f"已插入 {len(entities)} 条 what 多模态向量到 Milvus")
+    else:
+        print("未找到有效的 what 多模态向量,未插入数据")
+    #############存储一份what 多模态embedding 到Milvus
+
+
+
+

+ 114 - 0
deconstruct_SQI/milvus_deconstruct_query.py

@@ -0,0 +1,114 @@
+from ast import Import
+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
+import requests
+import json
+from typing import Dict, Any, List
+from pymongo import MongoClient
+from bson import ObjectId
+from pydub import AudioSegment
+import io
+from scipy.io import wavfile
+
+################################ Milvus connection: begin
+# Connection settings, passed as keyword arguments to connections.connect().
+# NOTE(review): the account password is hard-coded here in plain text;
+# consider loading it from the environment or a secrets store.
+MILVUS_CONFIG = {
+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
+    "user": "root",
+    "password": "Piaoquan@2025",
+    "port": "19530",
+}
+print("正在连接 Milvus 数据库...")
+# Executed at module top level: registers the connection under the alias "default".
+connections.connect("default", **MILVUS_CONFIG)
+print("连接成功!")
+################################ Milvus connection: end
+
+##################### MongoDB: begin
+# Local MongoDB instance; `coll` is the collection read by resolve_mongo_path
+# and by the query script below.
+MONGO_URI = "mongodb://localhost:27017/"
+DB_NAME = "mydeconstruct"
+COLL_NAME = "deconstruct"
+
+client = MongoClient(MONGO_URI)
+db = client[DB_NAME]
+coll = db[COLL_NAME]
+##################### MongoDB: end
+
+##################### 路径解析返回
+def resolve_mongo_path(mongo_id: str, path: str):
+    """
+    根据 mongo_id 与形如 '文本元素[1].子节点元素[0].what' 的路径字符串,
+    从 MongoDB 中定位并返回对应的对象。
+    """
+    doc = coll.find_one({"_id": ObjectId(mongo_id)})
+    if not doc:
+        return None
+
+    # 将路径按 '.' 分割,逐级访问
+    parts = path.split('.')
+    current = doc
+    for part in parts:
+        # 处理数组索引,如 子节点元素[0]
+        if '[' in part and part.endswith(']'):
+            key, idx_str = part.split('[', 1)
+            idx = int(idx_str[:-1])  # 去掉 ']'
+            current = current[key][idx]
+        else:
+            current = current[part]
+    return current
+##################### 路径解析返回
+
+search_mode ="what_search"
+
+if search_mode == "what_search":
+    ##################query what
+    ##################
+    milvus_client = Collection(name="deconstruct_what")
+    milvus_client.load()
+    collection_name = "deconstruct_what"
+    if not utility.has_collection(collection_name):
+        print(f"no collection named {collection_name}")
+    else:
+        # 查询并打印 collection 中的所有记录
+        print(f"正在查询 collection '{collection_name}' 中的所有记录...")
+        try:
+            # 使用 query 方法获取所有记录,不设置过滤条件
+            all_records = milvus_client.query(
+                expr="mongo_id >\"10000000\"",  # 空表达式表示查询所有
+                output_fields=["mongo_id","path"],  # 输出所有字段
+                limit=10000  # 设置一个较大的上限,确保能获取全部
+            )
+            print(f"共查询到 {len(all_records)} 条记录:")
+            for record in all_records:
+                print(record)
+                rec = resolve_mongo_path(record["mongo_id"], record["path"])
+                print("定位items:",rec)
+                # docres = coll.find_one({"_id": ObjectId(record["mongo_id"])})
+                # print(docres)
+        except Exception as e:
+            print(f"查询失败:{e}")
+    ##############all_records返回存储的每个record, rec返回解析后的对象
+elif search_mode == "media_search":
+    ##################query media
+    ##################
+    milvus_client = Collection(name="deconstruct_media")
+    milvus_client.load()
+    collection_name = "deconstruct_media"
+    if not utility.has_collection(collection_name):
+        print(f"no collection named {collection_name}")
+    else:
+        # 查询并打印 collection 中的所有记录
+        print(f"正在查询 collection '{collection_name}' 中的所有记录...")
+        try:
+            # 使用 query 方法获取所有记录,不设置过滤条件
+            all_records = milvus_client.query( 
+                expr="type==\"text\"",  # 空表达式表示查询所有
+                output_fields=["mongo_id","path","type"],  # 输出所有字段
+                limit=100  # 设置一个较大的上限,确保能获取全部
+            )
+            print(f"共查询到 {len(all_records)} 条记录:")
+            for record in all_records:
+                print(record)
+                docres = coll.find_one({"_id": ObjectId(record["mongo_id"])})
+                print(docres)
+        except Exception as e:
+            print(f"查询失败:{e}")
+

+ 243 - 0
deconstruct_SQI/milvus_deconstruct_search.py

@@ -0,0 +1,243 @@
+from ast import Import
+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
+import requests
+import json
+from typing import Dict, Any, List
+from pymongo import MongoClient
+from bson import ObjectId
+from pydub import AudioSegment
+import io, os
+from scipy.io import wavfile
+import numpy
+
+################################ Milvus connection: begin
+# Connection settings, passed as keyword arguments to connections.connect().
+# NOTE(review): the account password is hard-coded here in plain text;
+# consider loading it from the environment or a secrets store.
+MILVUS_CONFIG = {
+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
+    "user": "root",
+    "password": "Piaoquan@2025",
+    "port": "19530",
+}
+print("正在连接 Milvus 数据库...")
+# Executed at module top level: registers the connection under the alias "default".
+connections.connect("default", **MILVUS_CONFIG)
+print("连接成功!")
+################################ Milvus connection: end
+
+##################################引入多模态向量模型#################
+import torch
+from PIL import Image
+from transformers.utils.import_utils import is_flash_attn_2_available
+from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
+
+# Load the multimodal embedding model in bfloat16 on CUDA and switch it to
+# eval mode; flash-attention 2 is requested only when it is available.
+model = ColQwen2_5Omni.from_pretrained(
+    "vidore/colqwen-omni-v0.1",
+    torch_dtype=torch.bfloat16,
+    device_map="cuda",  # or "mps" if on Apple Silicon
+    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
+).eval()
+# NOTE(review): the processor is loaded from a different repository id
+# ("manu/...") than the model ("vidore/...") - confirm this is intended.
+processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
+################################## multimodal embedding model: end #################
+
+##################### MongoDB: begin
+# Local MongoDB instance; `coll` is used by the search script below to fetch
+# the documents referenced by the search hits.
+MONGO_URI = "mongodb://localhost:27017/"
+DB_NAME = "mydeconstruct"
+COLL_NAME = "deconstruct"
+
+client = MongoClient(MONGO_URI)
+db = client[DB_NAME]
+coll = db[COLL_NAME]
+##################### MongoDB: end
+
+##################### text embedding service: begin
+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
+
+def get_basic_embedding(text: str, model=DEFAULT_MODEL):
+    """通过HTTP调用在线embedding服务"""
+    headers = {
+        "Content-Type": "application/json"
+    }
+    data = {
+        "model": model,
+        "input": text
+    } 
+    response = requests.post(
+        VLLM_SERVER_URL,
+        headers=headers,
+        json=data,
+        timeout=5  # 添加超时设置
+    )
+    response.raise_for_status()  # 如果状态码不是200,抛出异常
+    result = response.json()
+    return result["data"][0]["embedding"]
+#####################text embedding serviceS
+
+#####################multi vector search
+import numpy as np
+from collections import defaultdict
+from typing import List, Dict, Tuple
+
+###############multi vector search
+def search_topk_multi(
+    collection: Collection,
+    query_vecs: List[List[float]],  # 查询向量列表 [vec1, vec2, ...]
+    topk: int = 2
+) -> List[Tuple[str, float]]:
+    """
+    对查询向量列表检索,计算每个对象的平均最大相似度,返回 TopK 对象
+    
+    参数:
+        collection: Milvus 集合实例
+        query_vecs: 查询向量列表(每个向量维度需与集合一致)
+        topk: 返回的 top 数量
+    
+    返回:
+        排序后的列表,元素为 (object_id, 平均最大相似度)
+    """
+    # 步骤1:逐个检索查询向量,收集每个对象的最大相似度
+    all_query_results = []  # 存储每个查询的 {object_id: 最大相似度}
+    for q_idx, q_vec in enumerate[List[float]](query_vecs):
+        # 检索当前查询向量
+        search_params = {
+            "metric_type": "IP",
+            "params": {"nprobe": 10}
+        }
+        results = collection.search(
+            data=[q_vec],
+            anns_field="embedding",
+            param=search_params,
+            limit=16384,  # Milvus最大允许的topk值
+            output_fields=["mongo_id", "type", "path"],
+            expr='type == "image"'  # 只检索type为text的记录
+        )
+        # 按 object_id 分组取最大相似度
+        query_object_sim = defaultdict(float)
+        for hit in results[0]:
+            obj_id = hit.entity.get("mongo_id")
+            sim = hit.score
+            if sim > query_object_sim[obj_id]:
+                query_object_sim[obj_id] = sim
+        
+        all_query_results.append(query_object_sim)
+        print(f"查询向量 {q_idx+1}/{len(query_vecs)} 处理完成,覆盖 {len(query_object_sim)} 个对象")
+    
+    # 步骤2:计算每个对象的平均最大相似度
+    all_object_ids = set()
+    for res in all_query_results:
+        all_object_ids.update(res.keys())  # 收集所有出现过的对象
+    
+    object_avg_sim = {}
+    for obj_id in all_object_ids:
+        sims = [res.get(obj_id, 0.0) for res in all_query_results]  # 未匹配的查询按0处理
+        avg_sim = sum(sims) / len(query_vecs)  # 计算平均值
+        object_avg_sim[obj_id] = avg_sim
+    
+    # 步骤3:按平均相似度排序并取 TopK
+    sorted_objects = sorted(
+        object_avg_sim.items(),
+        key=lambda x: x[1],
+        reverse=True
+    )[:topk]
+    return sorted_objects
+##################文本数据库 search
+
+###############single vector search
+def search_topk_single(
+    collection: Collection,
+    query_vec: List[float],  # 查询向量
+    topk: int = 2
+) -> List[dict]:
+    """
+    对单个查询向量检索,计算每个对象的最大相似度,返回 TopK 对象
+    
+    参数:
+        collection: Milvus 集合实例
+        query_vec: 查询向量(维度需与集合一致)
+        topk: 返回的 top 数量
+    
+    返回:
+        排序后的列表,元素为 (object_id, 最大相似度)
+    """
+    # 步骤1:检索当前查询向量
+    search_params = {
+        "metric_type": "IP",
+        "params": {"nprobe": 10}
+    }
+    results = collection.search(
+        data=[query_vec],
+        anns_field="embedding",
+        param=search_params,
+        limit=topk,  # Milvus最大允许的topk值
+        output_fields=["mongo_id", "path"],
+    )
+    return results[0]
+###############single vector search
+
+
+# Selects which branch below runs: "what_search" or "media_search".
+search_mode = "what_search"
+
+if search_mode =="what_search":
+    ################## text search over the "what" collection
+    ######## uses a simulated embedding for now
+
+    query = '#假如食物会说话'
+
+    # TODO: switch to the real embedding once the service is used here:
+    # queries_embeddings =  get_basic_embedding(text = query )
+
+    ######### temporary stand-in: random 2560-d query vector
+    import numpy as np
+    q_vec = list(np.random.randn(2560))
+    ######### end of temporary stand-in
+
+    milvus_client = Collection(name="deconstruct_what")
+    milvus_client.load()
+    collection_name = "deconstruct_what"
+
+    # NOTE(review): search_params is not passed to search_topk_single below.
+    search_params = {
+        "metric_type": "IP",
+        "params": {"nprobe": 10}
+    }
+
+    results = search_topk_single(milvus_client,q_vec,topk= 3)
+    print("results is ", results)
+
+    for i,record in enumerate(results):
+        ######### temporary stand-in: map the placeholder id to a fixed document id
+        if record['mongo_id'] =='10000000':
+            mongo_id = '68f894176a7850acc4851b27'
+        else:
+            mongo_id = record['mongo_id']
+        ######### end of temporary stand-in
+        docres = coll.find_one({"_id": ObjectId(mongo_id)})
+        print(f"第{i+1}个结果*********************:{docres}\n")
+
+elif search_mode =="media_search":
+    ################## multimodal search with an image query
+    ##################
+    queries = os.path.join("../src", "dragon_mother.jpeg")
+    query_image = Image.open(queries)
+    # Process the inputs
+    batch_queries = processor.process_images([query_image]).to(model.device)
+    # Forward pass
+    with torch.no_grad():
+        query_embeddings = model(**batch_queries)
+
+    ################## search the media collection
+    milvus_client = Collection(name="deconstruct_media")
+    milvus_client.load()
+    collection_name = "deconstruct_media"
+
+    # Convert the model output to nested Python lists (via float32) and keep
+    # the first batch item as the list of query vectors.
+    query_embeddings = query_embeddings.cpu().to(dtype=torch.float32).numpy().tolist()
+    query_embeddings=query_embeddings[0]
+    scores = search_topk_multi(milvus_client,query_embeddings,topk= 3)
+
+    print("search_topk_multi结果:",scores)
+    #### print the matching MongoDB documents
+    for i,record in enumerate(scores):
+        # record is (mongo_id, average similarity)
+        docres = coll.find_one({"_id": ObjectId(record[0])})
+        print(f"第{i+1}个结果*********************:{docres}\n")
+    
+
+
+
+

+ 194 - 0
deconstruct_SQI/milvus_how_insert.py

@@ -0,0 +1,194 @@
+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
+import requests
+import json
+from typing import Dict, Any, List
+from pymongo import MongoClient
+
+from pydub import AudioSegment
+import io
+from scipy.io import wavfile
+import numpy as np
+################################ Milvus connection: begin
+# Connection settings, passed as keyword arguments to connections.connect().
+# NOTE(review): the account password is hard-coded here in plain text;
+# consider loading it from the environment or a secrets store.
+MILVUS_CONFIG = {
+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
+    "user": "root",
+    "password": "Piaoquan@2025",
+    "port": "19530",
+}
+print("正在连接 Milvus 数据库...")
+# Executed at module top level: registers the connection under the alias "default".
+connections.connect("default", **MILVUS_CONFIG)
+print("连接成功!")
+################################ Milvus connection: end
+
+################################ Embedding service: begin
+# Note (from an earlier discussion): the remote service has to be forwarded
+# to the local machine through an SSH tunnel.
+# Run on the local machine: ssh -R 8000:192.168.100.31:8000 username@server_ip
+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
+
+def get_basic_embedding(text: str, model=DEFAULT_MODEL):
+    """通过HTTP调用在线embedding服务"""
+    headers = {
+        "Content-Type": "application/json"
+    }
+    data = {
+        "model": model,
+        "input": text
+    }
+    
+    response = requests.post(
+        VLLM_SERVER_URL,
+        headers=headers,
+        json=data,
+        timeout=5  # 添加超时设置
+    )
+    response.raise_for_status()  # 如果状态码不是200,抛出异常
+    result = response.json()
+    return result["data"][0]["embedding"]
+
+def parse_how_res(json_data) -> Dict[str, Dict[str, str]]:
+    """
+    解析 how_res.json 文件,提取两类信息:
+    1. 所有 "how","why" 字段的 path 与 value 映射
+    
+    返回:
+        {
+            "how": {path: value, ...},
+            "why": {path: value, ...}
+        }
+    """
+    data = json_data
+    how_dict: Dict[str, Any] = {}
+    why_dict: Dict[str, Any] = {}
+
+    def traverse(obj: Any, current_path: str = ""):
+        """递归遍历 JSON 结构,记录目标字段"""
+        if isinstance(obj, dict):
+            for k, v in obj.items():
+                # 构建新路径,避免在开头添加点号
+                new_path = f"{current_path}.{k}" if current_path else k
+                
+                if k == "how":
+                    how_dict[new_path] = v
+                elif k == "why":
+                    why_dict[new_path] = v
+                
+                # 继续递归遍历
+                traverse(v, new_path)
+        
+        elif isinstance(obj, list):
+            for idx, item in enumerate(obj):
+                # 对于数组元素,使用方括号索引
+                new_path = f"{current_path}[{idx}]"
+                traverse(item, new_path)
+
+    traverse(data)
+    return {"how": how_dict, "why": why_dict}
+
+# 使用示例
+if __name__ == "__main__":
+    
+    # 连接 MongoDB 数据库
+    ##################### 存储到mongoDB
+
+    MONGO_URI = "mongodb://localhost:27017/"
+    DB_NAME = "mydeconstruct"
+    COLL_NAME = "deconstruct_how"
+
+    client = MongoClient(MONGO_URI)
+    db = client[DB_NAME]
+    coll = db[COLL_NAME]
+
+    # 读取并插入 JSON 文件
+    json_path = "/home/ecs-user/project/colpali/src/how_res.json"
+
+    with open(json_path, "r", encoding="utf-8") as f:
+        doc = json.load(f)
+
+    result = parse_how_res(doc)
+    # print("how 字段映射:", result["how"])
+    # print("why 字段映射:", result["why"])
+
+    for key, value in result["how"].items():
+        print(f"how 字段 {key} 的值为: {value}")
+
+    for key, value in result["why"].items():
+        print(f"why 字段 {key} 的值为: {value}")
+
+    insert_result = coll.insert_one(doc)
+    inserted_id = insert_result.inserted_id
+
+    ##################### 将 result["how"] 中的每个 value 转换为向量并插入 Milvus
+    ########## 文本向量库存一份how
+    # 创建 Milvus 集合(如不存在)
+    collection_name = "deconstruct_how"
+    if not utility.has_collection(collection_name): 
+        fields = [
+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
+            FieldSchema(name="mongo_id", dtype=DataType.VARCHAR, max_length=64),
+            FieldSchema(name="type", dtype=DataType.VARCHAR, max_length=64),
+            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512),
+            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=2560)
+        ]
+        schema = CollectionSchema(fields, description="Deconstruct how embeddings")
+        collection = Collection(name=collection_name, schema=schema)
+        # 创建 IVF_FLAT 索引
+        index_params = {
+            "metric_type": "IP",
+            "index_type": "IVF_FLAT",
+            "params": {"nlist": 128}
+        }
+        collection.create_index("embedding", index_params)
+    else:
+        collection = Collection(name=collection_name)
+
+    entities = []
+    for key, value in result["how"].items():
+
+        ### 访问可达则替换
+        # embedding = get_basic_embedding(value, model=DEFAULT_MODEL)
+        ###
+        embedding = np.random.rand(2560).tolist()
+
+        path = key
+        entities.append({
+            "mongo_id": str(inserted_id),
+            "type": "how", 
+            "path": path,
+            "embedding": embedding
+        })
+
+    # 遍历 result["why"],生成 embeddings 并插入 Milvus
+    if entities:
+        collection.insert(entities)
+        collection.flush()
+        print(f"已插入 {len(entities)} 条 how 字段向量到 Milvus")
+    else:
+        print("未找到 how 字段,未插入向量")
+
+    entities = []
+    for key, value in result["why"].items():
+
+        # embedding = get_basic_embedding(value, model=DEFAULT_MODEL)
+        
+        embedding = np.random.rand(2560).tolist()
+
+        path = key
+        entities.append({
+            "mongo_id": str(inserted_id),
+            "type": "why", 
+            "path": path,
+            "embedding": embedding
+        })
+
+    if entities:
+        collection.insert(entities)
+        collection.flush()
+        print(f"已插入 {len(entities)} 条 why 字段向量到 Milvus")
+    else:
+        print("未找到 why 字段,未插入向量")
+
+
+
+

+ 116 - 0
deconstruct_SQI/milvus_how_query.py

@@ -0,0 +1,116 @@
+from ast import Import
+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
+import requests
+import json
+from typing import Dict, Any, List
+from pymongo import MongoClient
+from bson import ObjectId
+from pydub import AudioSegment
+import io
+from scipy.io import wavfile
+
+################################ Milvus connection: begin
+# Connection settings, passed as keyword arguments to connections.connect().
+# NOTE(review): the account password is hard-coded here in plain text;
+# consider loading it from the environment or a secrets store.
+MILVUS_CONFIG = {
+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
+    "user": "root",
+    "password": "Piaoquan@2025",
+    "port": "19530",
+}
+print("正在连接 Milvus 数据库...")
+# Executed at module top level: registers the connection under the alias "default".
+connections.connect("default", **MILVUS_CONFIG)
+print("连接成功!")
+################################ Milvus connection: end
+
+##################### MongoDB: begin
+# Local MongoDB instance; `coll` is the collection read by resolve_mongo_path.
+MONGO_URI = "mongodb://localhost:27017/"
+DB_NAME = "mydeconstruct"
+COLL_NAME = "deconstruct_how"
+
+client = MongoClient(MONGO_URI)
+db = client[DB_NAME]
+coll = db[COLL_NAME]
+##################### MongoDB: end
+
+##################### 路径解析返回
+def resolve_mongo_path(mongo_id: str, path: str):
+    """
+    根据 mongo_id 与形如 '文本元素[1].子节点元素[0].what' 的路径字符串,
+    从 MongoDB 中定位并返回对应的对象。
+    """
+    doc = coll.find_one({"_id": ObjectId(mongo_id)})
+    if not doc:
+        return None
+
+    # 将路径按 '.' 分割,逐级访问
+    parts = path.split('.')
+    current = doc
+    for part in parts:
+        # 处理数组索引,如 子节点元素[0]
+        if '[' in part and part.endswith(']'):
+            key, idx_str = part.split('[', 1)
+            idx = int(idx_str[:-1])  # 去掉 ']'
+            current = current[key][idx]
+        else:
+            current = current[part]
+    return current
+##################### 路径解析返回
+
+search_mode ="why_search" # "why_search"
+
+if search_mode == "how_search":
+    ##################query what
+    ##################
+    milvus_client = Collection(name="deconstruct_how")
+    milvus_client.load()
+    collection_name = "deconstruct_how"
+    if not utility.has_collection(collection_name):
+        print(f"no collection named {collection_name}")
+    else:
+        # 查询并打印 collection 中的所有记录
+        print(f"正在查询 collection '{collection_name}' 中的所有记录...")
+        try:
+            # 使用 query 方法获取所有记录,不设置过滤条件
+            all_records = milvus_client.query(
+                expr="mongo_id >\"10000000\" and type == \"how\"",  # 空表达式表示查询所有
+                output_fields=["mongo_id","type","path"],  # 输出所有字段
+                limit=10  # 设置一个较大的上限,确保能获取全部
+            )
+            print(f"共查询到 {len(all_records)} 条记录:")
+            for record in all_records:
+                print(record)
+                rec = resolve_mongo_path(record["mongo_id"], record["path"])
+                print("定位items:",rec)
+                # docres = coll.find_one({"_id": ObjectId(record["mongo_id"])})
+                # print(docres)
+        except Exception as e:
+            print(f"查询失败:{e}")
+    ##############all_records返回存储的每个record, rec返回解析后的对象
+elif search_mode == "why_search":
+    ##################query why
+    ##################
+    milvus_client = Collection(name="deconstruct_how")
+    milvus_client.load()
+    collection_name = "deconstruct_how"
+    if not utility.has_collection(collection_name):
+        print(f"no collection named {collection_name}")
+    else:
+        # 查询并打印 collection 中的所有记录
+        print(f"正在查询 collection '{collection_name}' 中的所有记录...")
+        try:
+            # 使用 query 方法获取所有记录,不设置过滤条件
+            all_records = milvus_client.query(
+                expr="mongo_id >\"10000000\" and type == \"why\"",  # 空表达式表示查询所有
+                output_fields=["mongo_id","type","path"],  # 输出所有字段
+                limit=10  # 设置一个较大的上限,确保能获取全部
+            )
+            print(f"共查询到 {len(all_records)} 条记录:")
+            for record in all_records:
+                print(record)
+                rec = resolve_mongo_path(record["mongo_id"], record["path"])
+                print("定位items:",rec)
+                # docres = coll.find_one({"_id": ObjectId(record["mongo_id"])})
+                # print(docres)
+        except Exception as e:
+            print(f"查询失败:{e}")
+    ##############all_records返回存储的每个record, rec返回解析后的对象

+ 241 - 0
deconstruct_SQI/milvus_how_search.py

@@ -0,0 +1,241 @@
+from ast import Import
+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
+import requests
+import json
+from typing import Dict, Any, List
+from pymongo import MongoClient
+from bson import ObjectId
+from pydub import AudioSegment
+import io, os
+from scipy.io import wavfile
+import numpy
+
+################################连接milvus数据库 A
+# Milvus connection settings.
+# Every value can be overridden through an environment variable; the literals
+# are the previously hard-coded values, kept only as backward-compatible
+# defaults so existing runs behave the same.
+# NOTE(review): a root password is committed to source here -- rotate it and
+# drop the default once MILVUS_PASSWORD is provisioned in every environment.
+MILVUS_CONFIG = {
+    "host": os.environ.get("MILVUS_HOST", "c-981be0ee7225467b-internal.milvus.aliyuncs.com"),
+    "user": os.environ.get("MILVUS_USER", "root"),
+    "password": os.environ.get("MILVUS_PASSWORD", "Piaoquan@2025"),
+    "port": os.environ.get("MILVUS_PORT", "19530"),
+}
+print("正在连接 Milvus 数据库...")
+# Registers the connection under the alias "default", which the Collection /
+# utility calls later in this script rely on implicitly.
+connections.connect("default", **MILVUS_CONFIG)
+print("连接成功!")
+################################连接milvus数据库 B
+
+##################################引入多模态向量模型#################
+import torch
+from PIL import Image
+from transformers.utils.import_utils import is_flash_attn_2_available
+from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
+
+# Pick the attention backend once: FlashAttention-2 when the helper reports it
+# as available, otherwise pass None.
+_attn_impl = "flash_attention_2" if is_flash_attn_2_available() else None
+
+# Load the ColQwen-Omni checkpoint in bfloat16 on the GPU; eval() is called on
+# the loaded model before it is bound to `model`.
+_loaded_model = ColQwen2_5Omni.from_pretrained(
+    "vidore/colqwen-omni-v0.1",
+    torch_dtype=torch.bfloat16,
+    device_map="cuda",  # or "mps" if on Apple Silicon
+    attn_implementation=_attn_impl,
+)
+model = _loaded_model.eval()
+
+# NOTE(review): the processor comes from a different repo id ("manu/...") than
+# the model ("vidore/...") -- confirm this pairing is intended.
+processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
+##################################引入多模态向量模型#################
+
+##################### mongoDB
+# Location of the MongoDB collection that stores the full deconstruct
+# documents referenced by the `mongo_id` field of each Milvus record.
+MONGO_URI = "mongodb://localhost:27017/"
+DB_NAME, COLL_NAME = "mydeconstruct", "deconstruct_how"
+
+# client -> database -> collection handles, all kept at module level because
+# the search script below reads `coll` directly.
+client = MongoClient(MONGO_URI)
+db = client[DB_NAME]
+coll = db[COLL_NAME]
+##################### mongoDB
+
+#####################text embedding serviceS
+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
+
+def get_basic_embedding(text: str, model=DEFAULT_MODEL, timeout: float = 5):
+    """Request an embedding for ``text`` from the HTTP embedding service.
+
+    Args:
+        text: Input string, sent as the ``input`` field of the JSON payload.
+        model: Model identifier, sent as the ``model`` field.
+        timeout: Seconds handed to ``requests.post`` as its timeout. It used
+            to be fixed at 5; the default keeps that behaviour.
+
+    Returns:
+        The ``embedding`` entry of the first item in the response's ``data``
+        list.
+
+    Raises:
+        Whatever ``requests.post`` / ``response.raise_for_status()`` raise
+        (connection errors, timeouts, HTTP error statuses), and
+        ``KeyError`` / ``IndexError`` if the response body lacks
+        ``data[0]["embedding"]``.
+    """
+    payload = {"model": model, "input": text}
+    response = requests.post(
+        VLLM_SERVER_URL,
+        headers={"Content-Type": "application/json"},
+        json=payload,
+        timeout=timeout,
+    )
+    # Fail loudly on an HTTP error status instead of parsing an error body.
+    response.raise_for_status()
+    return response.json()["data"][0]["embedding"]
+#####################text embedding serviceS
+#####################multi vector search
+import numpy as np
+from collections import defaultdict
+from typing import List, Dict, Tuple
+
+###############multi vector search
+def search_topk_multi(
+    collection: Collection,
+    query_vecs: List[List[float]],  # list of query vectors [vec1, vec2, ...]
+    topk: int = 2,
+    anns_field: str = "embedding",
+    expr: str = 'type == "image"',
+    limit: int = 16384,
+    output_fields=None,
+) -> List[Tuple[str, float]]:
+    """Rank objects by their average best similarity over several query vectors.
+
+    Each query vector is searched on its own. Within one query, hits are
+    grouped by ``mongo_id`` and only the highest score per object is kept.
+    An object's final score is the sum of those per-query maxima divided by
+    the number of query vectors; a query that returned no hit for the object
+    contributes 0.0.
+
+    Args:
+        collection: Object exposing a pymilvus-style ``search`` method.
+        query_vecs: Query vectors, searched one at a time.
+        topk: Number of (object, score) pairs to return.
+        anns_field: Vector field to search (default ``"embedding"``).
+        expr: Boolean filter applied to the search. The default restricts the
+            search to records whose ``type`` is ``"image"``.
+        limit: Per-query hit limit; the default 16384 was annotated in the
+            original code as the largest topk Milvus allows.
+        output_fields: Fields requested for every hit; must contain
+            ``"mongo_id"``. Defaults to ``["mongo_id", "type", "path"]``.
+
+    Returns:
+        Up to ``topk`` ``(mongo_id, average_max_similarity)`` tuples, highest
+        score first. Empty when ``query_vecs`` is empty or nothing matched.
+    """
+    if output_fields is None:
+        output_fields = ["mongo_id", "type", "path"]
+    search_params = {
+        "metric_type": "IP",
+        "params": {"nprobe": 10}
+    }
+
+    # Step 1: one search per query vector, keeping {mongo_id: best score}.
+    per_query_best = []
+    for q_idx, q_vec in enumerate(query_vecs):
+        results = collection.search(
+            data=[q_vec],
+            anns_field=anns_field,
+            param=search_params,
+            limit=limit,
+            output_fields=output_fields,
+            expr=expr,
+        )
+        # A plain dict (not defaultdict(float)) so that negative inner-product
+        # scores are kept as-is instead of being silently clamped to 0.0.
+        best_for_query = {}
+        for hit in results[0]:
+            obj_id = hit.entity.get("mongo_id")
+            score = hit.score
+            if obj_id not in best_for_query or score > best_for_query[obj_id]:
+                best_for_query[obj_id] = score
+
+        per_query_best.append(best_for_query)
+        print(f"查询向量 {q_idx+1}/{len(query_vecs)} 处理完成,覆盖 {len(best_for_query)} 个对象")
+
+    # Step 2: average the per-query maxima over ALL query vectors.
+    seen_ids = set()
+    for best_for_query in per_query_best:
+        seen_ids.update(best_for_query)
+
+    object_avg_sim = {
+        obj_id: sum(best.get(obj_id, 0.0) for best in per_query_best) / len(query_vecs)
+        for obj_id in seen_ids
+    }
+
+    # Step 3: highest average first, truncated to topk.
+    return sorted(
+        object_avg_sim.items(),
+        key=lambda item: item[1],
+        reverse=True
+    )[:topk]
+##################文本数据库 search
+###############single vector search
+def search_topk_single(
+    collection: Collection,
+    query_vec: List[float],  # query vector
+    topk: int = 2,
+    expr='type=="why"',
+) -> List[dict]:
+    """Run one inner-product search on the ``embedding`` field.
+
+    Args:
+        collection: Object exposing a pymilvus-style ``search`` method.
+        query_vec: The single query vector.
+        topk: Passed to the search as ``limit``.
+        expr: Boolean filter expression passed to the search.
+
+    Returns:
+        The first element of the search result, i.e. the hits for the one
+        query vector that was submitted. No grouping or re-ranking is done
+        here; each hit carries ``mongo_id``, ``type`` and ``path``.
+    """
+    ip_search = {
+        "metric_type": "IP",
+        "params": {"nprobe": 10}
+    }
+    wanted_fields = ["mongo_id", "type", "path"]
+    per_query_hits = collection.search(
+        data=[query_vec],
+        anns_field="embedding",
+        param=ip_search,
+        limit=topk,
+        output_fields=wanted_fields,
+        expr=expr,
+    )
+    # Only one vector was sent, so only the first result set is meaningful.
+    return per_query_hits[0]
+###############single vector search
+
+search_mode = "how_search" #"why_search"
+
+# Milvus filter expression for each supported mode. Both modes run the same
+# pipeline against the same collection and differ only in this filter and in
+# whether the resolved MongoDB document is printed.
+_MODE_EXPR = {
+    "how_search": 'type=="how"',
+    "why_search": 'type=="why"',
+}
+
+if search_mode in _MODE_EXPR:
+    query = '#假如食物会说话'
+
+    # Placeholder: a random 2560-dim vector stands in for
+    # get_basic_embedding(text=query) until the embedding service is wired in.
+    import numpy as np
+    q_vec = list(np.random.randn(2560))
+
+    milvus_client = Collection(name="deconstruct_how")
+    milvus_client.load()
+    collection_name = "deconstruct_how"
+
+    search_params = {
+        "metric_type": "IP",
+        "params": {"nprobe": 10}
+    }
+
+    results = search_topk_single(milvus_client, q_vec, topk=3, expr=_MODE_EXPR[search_mode])
+    print("results is ", results)
+
+    for i, record in enumerate(results):
+        # Temporary mapping: the placeholder id '10000000' is redirected to a
+        # fixed ObjectId string; any other id is used unchanged.
+        if record['mongo_id'] != '10000000':
+            mongo_id = record['mongo_id']
+        else:
+            mongo_id = '68f894176a7850acc4851b27'
+        docres = coll.find_one({"_id": ObjectId(mongo_id)})
+        # Only the "why" mode prints the resolved document.
+        if search_mode == "why_search":
+            print(f"第{i+1}个结果*********************:{docres}\n")

+ 184 - 0
deconstruct_SQI/milvus_pattern_insert.py

@@ -0,0 +1,184 @@
+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
+import requests
+import json
+from typing import Dict, Any, List
+from pymongo import MongoClient
+
+from pydub import AudioSegment
+import io
+from scipy.io import wavfile
+import numpy as np
+################################连接milvus数据库 A
+# Milvus connection settings.
+# Every value can be overridden through an environment variable; the literals
+# are the previously hard-coded values, kept only as backward-compatible
+# defaults so existing runs behave the same.
+# NOTE(review): a root password is committed to source here -- rotate it and
+# drop the default once MILVUS_PASSWORD is provisioned in every environment.
+import os
+
+MILVUS_CONFIG = {
+    "host": os.environ.get("MILVUS_HOST", "c-981be0ee7225467b-internal.milvus.aliyuncs.com"),
+    "user": os.environ.get("MILVUS_USER", "root"),
+    "password": os.environ.get("MILVUS_PASSWORD", "Piaoquan@2025"),
+    "port": os.environ.get("MILVUS_PORT", "19530"),
+}
+print("正在连接 Milvus 数据库...")
+# Registers the connection under the alias "default", which the Collection /
+# utility calls later in this script rely on implicitly.
+connections.connect("default", **MILVUS_CONFIG)
+print("连接成功!")
+################################连接milvus数据库 B
+
+################################连接Embedding service A
+# 注意:根据之前的讨论,需要通过SSH隧道将远程服务转发到本地
+# 在本地机器上执行: ssh -R 8000:192.168.100.31:8000 username@server_ip
+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
+
+def get_basic_embedding(text: str, model=DEFAULT_MODEL, timeout: float = 5):
+    """Request an embedding for ``text`` from the HTTP embedding service.
+
+    Args:
+        text: Input string, sent as the ``input`` field of the JSON payload.
+        model: Model identifier, sent as the ``model`` field.
+        timeout: Seconds handed to ``requests.post`` as its timeout. It used
+            to be fixed at 5; the default keeps that behaviour.
+
+    Returns:
+        The ``embedding`` entry of the first item in the response's ``data``
+        list.
+
+    Raises:
+        Whatever ``requests.post`` / ``response.raise_for_status()`` raise
+        (connection errors, timeouts, HTTP error statuses), and
+        ``KeyError`` / ``IndexError`` if the response body lacks
+        ``data[0]["embedding"]``.
+    """
+    payload = {"model": model, "input": text}
+    response = requests.post(
+        VLLM_SERVER_URL,
+        headers={"Content-Type": "application/json"},
+        json=payload,
+        timeout=timeout,
+    )
+    # Fail loudly on an HTTP error status instead of parsing an error body.
+    response.raise_for_status()
+    return response.json()["data"][0]["embedding"]
+
+def parse_pattern_res(json_data) -> Dict[str, Dict[str, Dict[str, Any]]]:
+    """Collect every pattern node found anywhere in a parsed JSON structure.
+
+    A "pattern node" is any dict that has a "模式ID" key. For each such dict
+    the function records its "模式ID" value together with the sibling
+    "模式命名" and "模式说明" values (empty string when a sibling is absent).
+
+    Paths are built with ``.`` between dict keys and ``[i]`` for list
+    positions, e.g. ``a[0].child``. A pattern dict at the very root is stored
+    under the empty path ``""``.
+
+    Args:
+        json_data: Already-parsed JSON (nested dicts / lists / scalars).
+
+    Returns:
+        ``{"pattern": {path: {"模式ID": ..., "模式命名": ..., "模式说明": ...}}}``
+        where ``path`` locates the dict that holds the "模式ID" key.
+    """
+    found: Dict[str, Dict[str, Any]] = {}
+
+    def walk(node: Any, path: str = "") -> None:
+        """Depth-first walk that fills ``found``; scalars are ignored."""
+        if isinstance(node, list):
+            for idx, child in enumerate(node):
+                walk(child, f"{path}[{idx}]")
+            return
+        if not isinstance(node, dict):
+            return
+        for key, child in node.items():
+            if key == "模式ID":
+                # Recorded at the moment the key is met (not up front) so the
+                # insertion order of `found` matches the document order.
+                found[path] = {
+                    "模式ID": child,
+                    "模式命名": node.get("模式命名", ""),
+                    "模式说明": node.get("模式说明", ""),
+                }
+            # No leading dot for top-level keys.
+            walk(child, f"{path}.{key}" if path else key)
+
+    walk(json_data)
+    return {"pattern": found}
+
+# 使用示例
+if __name__ == "__main__":
+    # ---- MongoDB: keep the raw pattern document ---------------------------
+    MONGO_URI = "mongodb://localhost:27017/"
+    DB_NAME = "mydeconstruct"
+    # NOTE(review): pattern documents go into the "deconstruct_how" Mongo
+    # collection; the pattern query script reads from the same one -- confirm
+    # this sharing is intended.
+    COLL_NAME = "deconstruct_how"
+
+    client = MongoClient(MONGO_URI)
+    db = client[DB_NAME]
+    coll = db[COLL_NAME]
+
+    # Load the pattern JSON and extract every pattern node with its path.
+    json_path = "/home/ecs-user/project/colpali/src/pattern_res.json"
+
+    with open(json_path, "r", encoding="utf-8") as f:
+        doc = json.load(f)
+
+    result = parse_pattern_res(doc)
+
+    for key, value in result["pattern"].items():
+        print(f"pattern 字段 {key} 的值为: {value}")
+
+    # The whole document is stored once; each Milvus row below points back to
+    # it through the inserted ObjectId plus the node's path.
+    insert_result = coll.insert_one(doc)
+    inserted_id = insert_result.inserted_id
+
+    # ---- Milvus: one row per pattern node ---------------------------------
+    collection_name = "deconstruct_pattern"
+    if not utility.has_collection(collection_name):
+        fields = [
+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
+            FieldSchema(name="mongo_id", dtype=DataType.VARCHAR, max_length=64),
+            FieldSchema(name="pattern_id", dtype=DataType.VARCHAR, max_length=64),
+            FieldSchema(name="pattern_name", dtype=DataType.VARCHAR, max_length=128),
+            FieldSchema(name="pattern_desc", dtype=DataType.VARCHAR, max_length=2048),
+            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512),
+            FieldSchema(name="name_embedding", dtype=DataType.FLOAT_VECTOR, dim=2560),
+            FieldSchema(name="desc_embedding", dtype=DataType.FLOAT_VECTOR, dim=2560)
+        ]
+        # Description previously said "how" (copy-paste from the how script).
+        schema = CollectionSchema(fields, description="Deconstruct pattern embeddings")
+        collection = Collection(name=collection_name, schema=schema)
+
+        # IVF_FLAT / inner-product index shared by both vector fields.
+        index_params = {
+            "metric_type": "IP",
+            "index_type": "IVF_FLAT",
+            "params": {"nlist": 128}
+        }
+
+        # Scalar index so pattern_id equality filters do not scan every row.
+        collection.create_index("pattern_id", {
+            "index_type": "INVERTED" #"Trie"
+        })
+        collection.create_index("name_embedding", index_params)
+        collection.create_index("desc_embedding", index_params)
+    else:
+        collection = Collection(name=collection_name)
+
+    entities = []
+    for path, value in result["pattern"].items():
+        pattern_name = value["模式命名"]
+        pattern_desc = value["模式说明"]
+
+        # Placeholder vectors. Once the embedding service is reachable,
+        # replace with get_basic_embedding(pattern_name, model=DEFAULT_MODEL)
+        # and get_basic_embedding(pattern_desc, model=DEFAULT_MODEL).
+        name_embedding = np.random.rand(2560).tolist()
+        desc_embedding = np.random.rand(2560).tolist()
+
+        entities.append({
+            "mongo_id": str(inserted_id),
+            "pattern_id": value["模式ID"],
+            "pattern_name": pattern_name,
+            "pattern_desc": pattern_desc,
+            "path": path,
+            "name_embedding": name_embedding,
+            "desc_embedding": desc_embedding
+        })
+
+    # Messages below used to say "how" although this script stores patterns.
+    if entities:
+        collection.insert(entities)
+        collection.flush()
+        print(f"已插入 {len(entities)} 条 pattern 字段向量到 Milvus")
+    else:
+        print("未找到 pattern 字段,未插入向量")
+
+
+
+

+ 87 - 0
deconstruct_SQI/milvus_pattern_query.py

@@ -0,0 +1,87 @@
+from ast import Import
+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
+import requests
+import json
+from typing import Dict, Any, List
+from pymongo import MongoClient
+from bson import ObjectId
+from pydub import AudioSegment
+import io
+from scipy.io import wavfile
+
+################################连接milvus数据库 A
+# Milvus connection settings.
+# Every value can be overridden through an environment variable; the literals
+# are the previously hard-coded values, kept only as backward-compatible
+# defaults so existing runs behave the same.
+# NOTE(review): a root password is committed to source here -- rotate it and
+# drop the default once MILVUS_PASSWORD is provisioned in every environment.
+import os
+
+MILVUS_CONFIG = {
+    "host": os.environ.get("MILVUS_HOST", "c-981be0ee7225467b-internal.milvus.aliyuncs.com"),
+    "user": os.environ.get("MILVUS_USER", "root"),
+    "password": os.environ.get("MILVUS_PASSWORD", "Piaoquan@2025"),
+    "port": os.environ.get("MILVUS_PORT", "19530"),
+}
+print("正在连接 Milvus 数据库...")
+# Registers the connection under the alias "default", which the Collection /
+# utility calls later in this script rely on implicitly.
+connections.connect("default", **MILVUS_CONFIG)
+print("连接成功!")
+################################连接milvus数据库 B
+
+##################### mongoDB
+# Location of the MongoDB collection that resolve_mongo_path() reads from.
+MONGO_URI = "mongodb://localhost:27017/"
+DB_NAME, COLL_NAME = "mydeconstruct", "deconstruct_how"
+
+# client -> database -> collection handles, kept at module level because the
+# path resolver below uses `coll` as a global.
+client = MongoClient(MONGO_URI)
+db = client[DB_NAME]
+coll = db[COLL_NAME]
+##################### mongoDB
+
+##################### 路径解析返回
+def resolve_mongo_path(mongo_id: str, path: str):
+    """Fetch a MongoDB document and return the sub-object located at ``path``.
+
+    ``path`` uses ``.`` between dict keys and ``[i]`` for list positions,
+    e.g. ``'文本元素[1].子节点元素[0].what'``. Compared with the first
+    version this also accepts:
+
+    * an empty path, which returns the whole document,
+    * a segment that is only an index (``[0].x``, for a root-level list),
+    * several indices on one segment (``a[0][1]``, for lists of lists).
+
+    Args:
+        mongo_id: String form of the document's ObjectId.
+        path: Location inside the document, in the format described above.
+
+    Returns:
+        The object at ``path``, or ``None`` when no document has that id.
+
+    Raises:
+        KeyError / IndexError: a key or index in ``path`` does not exist.
+        ValueError: an index is not an integer or brackets are malformed.
+    """
+    doc = coll.find_one({"_id": ObjectId(mongo_id)})
+    if not doc:
+        return None
+    if not path:
+        return doc
+
+    current = doc
+    for part in path.split('.'):
+        # Only segments that end with ']' are parsed as indexed; anything
+        # else stays a plain key, exactly as before.
+        if '[' not in part or not part.endswith(']'):
+            current = current[part]
+            continue
+
+        key, _, rest = part.partition('[')
+        if key:
+            current = current[key]
+        # `rest` now looks like "0]" or "0][1]"; consume one index per pass.
+        while rest:
+            idx_str, closing, rest = rest.partition(']')
+            if not closing:
+                raise ValueError(f"malformed path segment: {part!r}")
+            current = current[int(idx_str)]
+            if rest:
+                if not rest.startswith('['):
+                    raise ValueError(f"malformed path segment: {part!r}")
+                rest = rest[1:]
+    return current
+##################### 路径解析返回
+
+search_mode ="id_search"  # "name_search" ,"desc_search", "id_search"
+
+if search_mode == "id_search":
+    # Scalar lookup: fetch pattern rows by exact pattern_id, then resolve each
+    # row's path inside the MongoDB document it points to.
+    collection_name = "deconstruct_pattern"
+    # The existence check must come BEFORE the Collection handle is built and
+    # loaded; in the previous order those calls ran first, so this branch
+    # could never report a missing collection.
+    if not utility.has_collection(collection_name):
+        print(f"no collection named {collection_name}")
+    else:
+        milvus_client = Collection(name=collection_name)
+        milvus_client.load()
+        print(f"正在查询 collection '{collection_name}' 中的所有记录...")
+        try:
+            all_records = milvus_client.query(
+                expr="pattern_id =='pattern_制作-图集-形容词_2' ",  # exact match on one pattern id
+                output_fields=["mongo_id","pattern_id","pattern_name","pattern_desc","path"],  # scalar fields only, no vectors
+                limit=10  # at most 10 rows are returned
+            )
+            print(f"共查询到 {len(all_records)} 条记录:")
+            for record in all_records:
+                print(record)
+                # `record` is the stored Milvus row; `rec` is the object that
+                # row's path points to inside the MongoDB document.
+                rec = resolve_mongo_path(record["mongo_id"], record["path"])
+                print("定位items:",rec)
+        except Exception as e:
+            # Best-effort script: report the failure and carry on.
+            print(f"查询失败:{e}")

+ 207 - 0
deconstruct_SQI/milvus_pattern_search.py

@@ -0,0 +1,207 @@
+from ast import Import
+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
+import requests
+import json
+from typing import Dict, Any, List
+from pymongo import MongoClient
+from bson import ObjectId
+from pydub import AudioSegment
+import io, os
+from scipy.io import wavfile
+import numpy
+
+################################连接milvus数据库 A
+# Milvus connection settings.
+# Every value can be overridden through an environment variable; the literals
+# are the previously hard-coded values, kept only as backward-compatible
+# defaults so existing runs behave the same.
+# NOTE(review): a root password is committed to source here -- rotate it and
+# drop the default once MILVUS_PASSWORD is provisioned in every environment.
+MILVUS_CONFIG = {
+    "host": os.environ.get("MILVUS_HOST", "c-981be0ee7225467b-internal.milvus.aliyuncs.com"),
+    "user": os.environ.get("MILVUS_USER", "root"),
+    "password": os.environ.get("MILVUS_PASSWORD", "Piaoquan@2025"),
+    "port": os.environ.get("MILVUS_PORT", "19530"),
+}
+print("正在连接 Milvus 数据库...")
+# Registers the connection under the alias "default", which the Collection /
+# utility calls later in this script rely on implicitly.
+connections.connect("default", **MILVUS_CONFIG)
+print("连接成功!")
+################################连接milvus数据库 B
+
+# ##################################引入多模态向量模型#################
+# import torch
+# from PIL import Image
+# from transformers.utils.import_utils import is_flash_attn_2_available
+# from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
+
+# model = ColQwen2_5Omni.from_pretrained(
+#     "vidore/colqwen-omni-v0.1",
+#     torch_dtype=torch.bfloat16,
+#     device_map="cuda",  # or "mps" if on Apple Silicon
+#     attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
+# ).eval()
+# processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
+# ##################################引入多模态向量模型#################
+
+##################### mongoDB
+# Location of the MongoDB collection holding the full documents that the
+# `mongo_id` field of each Milvus pattern row points to.
+MONGO_URI = "mongodb://localhost:27017/"
+DB_NAME, COLL_NAME = "mydeconstruct", "deconstruct_how"
+
+# client -> database -> collection handles, kept at module level because the
+# search script below reads `coll` directly.
+client = MongoClient(MONGO_URI)
+db = client[DB_NAME]
+coll = db[COLL_NAME]
+##################### mongoDB
+
+#####################text embedding serviceS
+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
+
+def get_basic_embedding(text: str, model=DEFAULT_MODEL, timeout: float = 5):
+    """Request an embedding for ``text`` from the HTTP embedding service.
+
+    Args:
+        text: Input string, sent as the ``input`` field of the JSON payload.
+        model: Model identifier, sent as the ``model`` field.
+        timeout: Seconds handed to ``requests.post`` as its timeout. It used
+            to be fixed at 5; the default keeps that behaviour.
+
+    Returns:
+        The ``embedding`` entry of the first item in the response's ``data``
+        list.
+
+    Raises:
+        Whatever ``requests.post`` / ``response.raise_for_status()`` raise
+        (connection errors, timeouts, HTTP error statuses), and
+        ``KeyError`` / ``IndexError`` if the response body lacks
+        ``data[0]["embedding"]``.
+    """
+    payload = {"model": model, "input": text}
+    response = requests.post(
+        VLLM_SERVER_URL,
+        headers={"Content-Type": "application/json"},
+        json=payload,
+        timeout=timeout,
+    )
+    # Fail loudly on an HTTP error status instead of parsing an error body.
+    response.raise_for_status()
+    return response.json()["data"][0]["embedding"]
+#####################text embedding serviceS
+#####################multi vector search
+import numpy as np
+from collections import defaultdict
+from typing import List, Dict, Tuple
+
+###############multi vector search
+def search_topk_multi(
+    collection: Collection,
+    query_vecs: List[List[float]],  # list of query vectors [vec1, vec2, ...]
+    topk: int = 2,
+    anns_field: str = "embedding",
+    expr: str = 'type == "image"',
+    limit: int = 16384,
+    output_fields=None,
+) -> List[Tuple[str, float]]:
+    """Rank objects by their average best similarity over several query vectors.
+
+    Each query vector is searched on its own. Within one query, hits are
+    grouped by ``mongo_id`` and only the highest score per object is kept.
+    An object's final score is the sum of those per-query maxima divided by
+    the number of query vectors; a query that returned no hit for the object
+    contributes 0.0.
+
+    NOTE(review): the defaults (``embedding`` field, ``type`` filter and
+    output field) come from the how/what collections. The
+    ``deconstruct_pattern`` collection used by this script appears to have
+    neither, so pass ``anns_field``, ``expr`` and ``output_fields``
+    explicitly when searching it.
+
+    Args:
+        collection: Object exposing a pymilvus-style ``search`` method.
+        query_vecs: Query vectors, searched one at a time.
+        topk: Number of (object, score) pairs to return.
+        anns_field: Vector field to search (default ``"embedding"``).
+        expr: Boolean filter applied to the search. The default restricts the
+            search to records whose ``type`` is ``"image"``.
+        limit: Per-query hit limit; the default 16384 was annotated in the
+            original code as the largest topk Milvus allows.
+        output_fields: Fields requested for every hit; must contain
+            ``"mongo_id"``. Defaults to ``["mongo_id", "type", "path"]``.
+
+    Returns:
+        Up to ``topk`` ``(mongo_id, average_max_similarity)`` tuples, highest
+        score first. Empty when ``query_vecs`` is empty or nothing matched.
+    """
+    if output_fields is None:
+        output_fields = ["mongo_id", "type", "path"]
+    search_params = {
+        "metric_type": "IP",
+        "params": {"nprobe": 10}
+    }
+
+    # Step 1: one search per query vector, keeping {mongo_id: best score}.
+    per_query_best = []
+    for q_idx, q_vec in enumerate(query_vecs):
+        results = collection.search(
+            data=[q_vec],
+            anns_field=anns_field,
+            param=search_params,
+            limit=limit,
+            output_fields=output_fields,
+            expr=expr,
+        )
+        # A plain dict (not defaultdict(float)) so that negative inner-product
+        # scores are kept as-is instead of being silently clamped to 0.0.
+        best_for_query = {}
+        for hit in results[0]:
+            obj_id = hit.entity.get("mongo_id")
+            score = hit.score
+            if obj_id not in best_for_query or score > best_for_query[obj_id]:
+                best_for_query[obj_id] = score
+
+        per_query_best.append(best_for_query)
+        print(f"查询向量 {q_idx+1}/{len(query_vecs)} 处理完成,覆盖 {len(best_for_query)} 个对象")
+
+    # Step 2: average the per-query maxima over ALL query vectors.
+    seen_ids = set()
+    for best_for_query in per_query_best:
+        seen_ids.update(best_for_query)
+
+    object_avg_sim = {
+        obj_id: sum(best.get(obj_id, 0.0) for best in per_query_best) / len(query_vecs)
+        for obj_id in seen_ids
+    }
+
+    # Step 3: highest average first, truncated to topk.
+    return sorted(
+        object_avg_sim.items(),
+        key=lambda item: item[1],
+        reverse=True
+    )[:topk]
+##################文本数据库 search
+###############single vector search
+def search_topk_single(
+    collection: Collection,
+    query_vec: List[float],  # query vector
+    topk: int = 2,
+    anns_field: str = "name_embedding",
+) -> List[dict]:
+    """Run one inner-product search against a pattern vector field.
+
+    Args:
+        collection: Object exposing a pymilvus-style ``search`` method.
+        query_vec: The single query vector.
+        topk: Passed to the search as ``limit``.
+        anns_field: Vector field to search. Defaults to ``"name_embedding"``
+            (the previous fixed value); pass ``"desc_embedding"`` to search
+            pattern descriptions instead.
+
+    Returns:
+        The first element of the search result, i.e. the hits for the one
+        query vector that was submitted. No grouping or re-ranking is done
+        here; each hit carries ``mongo_id``, ``pattern_id``,
+        ``pattern_name``, ``pattern_desc`` and ``path``.
+    """
+    search_params = {
+        "metric_type": "IP",
+        "params": {"nprobe": 10}
+    }
+    results = collection.search(
+        data=[query_vec],
+        anns_field=anns_field,
+        param=search_params,
+        limit=topk,
+        output_fields=["mongo_id","pattern_id","pattern_name","pattern_desc","path"]
+    )
+    # Only one vector was sent, so only the first result set is meaningful.
+    return results[0]
+###############single vector search
+
+search_mode = "name_search" #"desc_search"
+
+if search_mode =="name_search":
+    # Vector search over pattern names, followed by a MongoDB lookup of the
+    # document each hit points to.
+    query = '#假如食物会说话'
+
+    # Placeholder: a random 2560-dim vector stands in for
+    # get_basic_embedding(text=query) until the embedding service is wired in.
+    import numpy as np
+    q_vec = list(np.random.randn(2560))
+
+    milvus_client = Collection(name="deconstruct_pattern")
+    milvus_client.load()
+    collection_name = "deconstruct_pattern"
+
+    search_params = {
+        "metric_type": "IP",
+        "params": {"nprobe": 10}
+    }
+
+    results = search_topk_single(milvus_client, q_vec, topk=3)
+    print("results is ", results)
+
+    for i, record in enumerate(results):
+        # Temporary mapping: the placeholder id '10000000' is redirected to a
+        # fixed ObjectId string; any other id is used unchanged.
+        if record['mongo_id'] != '10000000':
+            mongo_id = record['mongo_id']
+        else:
+            mongo_id = '68f894176a7850acc4851b27'
+        # The document is fetched but not printed in this mode.
+        docres = coll.find_one({"_id": ObjectId(mongo_id)})