hai 1 mes · e836747eee
--- a/deconstruct_SQI/milvus_deconstruct_insert.py
+++ b/deconstruct_SQI/milvus_deconstruct_insert.py
@@ -0,0 +1,312 @@
 
				+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
			
 
				+import requests
			
 
				+import json
			
 
				+from typing import Dict, Any, List
			
 
				+from pymongo import MongoClient
			
 
				+
			
 
				+from pydub import AudioSegment
			
 
				+import io
			
 
				+from scipy.io import wavfile
			
 
				+
			
 
				+################################连接milvus数据库 A
			
 
				+# 配置信息
			
 
				+MILVUS_CONFIG = {
			
 
				+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
			
 
				+    "user": "root",
			
 
				+    "password": "Piaoquan@2025",
			
 
				+    "port": "19530",
			
 
				+}
			
 
				+print("正在连接 Milvus 数据库...")
			
 
				+connections.connect("default", **MILVUS_CONFIG)
			
 
				+print("连接成功！")
			
 
				+################################连接milvus数据库 B
			
 
				+
			
 
				+##################################引入多模态模型#################
			
 
				+import torch
			
 
				+from PIL import Image
			
 
				+from transformers.utils.import_utils import is_flash_attn_2_available
			
 
				+
			
 
				+from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
			
 
				+
			
 
				+model = ColQwen2_5Omni.from_pretrained(
			
 
				+    "vidore/colqwen-omni-v0.1",
			
 
				+    torch_dtype=torch.bfloat16,
			
 
				+    device_map="cuda",  # or "mps" if on Apple Silicon
			
 
				+    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
			
 
				+).eval()
			
 
				+processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
			
 
				+##################################引入模型#################
			
 
				+
			
 
				+################################连接Embedding service A
			
 
				+# 注意：根据之前的讨论，需要通过SSH隧道将远程服务转发到本地
			
 
				+# 在本地机器上执行: ssh -R 8000:192.168.100.31:8000 username@server_ip
			
 
				+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
			
 
				+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
			
 
				+
			
 
				+def get_basic_embedding(text: str, model=DEFAULT_MODEL):
			
 
				+    """通过HTTP调用在线embedding服务"""
			
 
				+    headers = {
			
 
				+        "Content-Type": "application/json"
			
 
				+    }
			
 
				+    data = {
			
 
				+        "model": model,
			
 
				+        "input": text
			
 
				+    }
			
 
				+    
			
 
				+    response = requests.post(
			
 
				+        VLLM_SERVER_URL,
			
 
				+        headers=headers,
			
 
				+        json=data,
			
 
				+        timeout=5  # 添加超时设置
			
 
				+    )
			
 
				+    response.raise_for_status()  # 如果状态码不是200，抛出异常
			
 
				+    result = response.json()
			
 
				+    return result["data"][0]["embedding"]
			
 
				+
			
 
				+def get_media_embedding(query: str, type: str):
			
 
				+    '''
			
 
				+    query 是查询字符串或文件路径
			
 
				+    type 是查询类型，可选值为 "audio", "image", "video", "text"
			
 
				+    k 是返回的结果数量，默认值为 3
			
 
				+    audio image video 的query为路径
			
 
				+    text的query为问题本身
			
 
				+    '''
			
 
				+    if type =="audio":
			
 
				+        batch_queries = processor.process_audios([query]).to(model.device)
			
 
				+
			
 
				+    elif type =="image":
			
 
				+        query_image = Image.open(query)
			
 
				+        batch_queries = processor.process_images([query_image]).to(model.device)
			
 
				+    elif type =="video":
			
 
				+        batch_queries = processor.process_videos([query]).to(model.device)   
			
 
				+    elif type =="text":
			
 
				+        batch_queries = processor.process_queries([query]).to(model.device)
			
 
				+    # Forward pass
			
 
				+    with torch.no_grad():
			
 
				+        query_embeddings = model(**batch_queries)
			
 
				+    return query_embeddings
			
 
				+    # # scores = processor.score_multi_vector(query_embeddings, ds)
			
 
				+    # print("score is ", scores)
			
 
				+    # # get top-5 scores
			
 
				+    # return scores[0].topk(k).indices.tolist()
			
 
				+
			
 
				+# ################################连接Embedding service B
			
 
				+
			
 
				+def parse_deconstruct_res(json_data) -> Dict[str, Dict[str, str]]:
			
 
				+    """
			
 
				+    解析 deconstruct_res.json 文件，提取两类信息：
			
 
				+    1. 所有 "what" 字段的 path 与 value 映射
			
 
				+    2. 所有类型为 "image" 或 "video" 的媒体引用 path 与 content 值映射
			
 
				+    
			
 
				+    返回:
			
 
				+        {
			
 
				+            "what": {path: value, ...},
			
 
				+            "media": {path: value, ...}
			
 
				+        }
			
 
				+    """
			
 
				+    data = json_data
			
 
				+    what_dict: Dict[str, Any] = {}
			
 
				+    media_dict: Dict[str, Any] = {}
			
 
				+
			
 
				+    def traverse(obj: Any, current_path: str = ""):
			
 
				+        """递归遍历 JSON 结构，记录目标字段"""
			
 
				+        if isinstance(obj, dict):
			
 
				+            for k, v in obj.items():
			
 
				+                # 构建新路径，避免在开头添加点号
			
 
				+                new_path = f"{current_path}.{k}" if current_path else k
			
 
				+                
			
 
				+                if k == "what":
			
 
				+                    what_dict[new_path] = v
			
 
				+                # 处理媒体引用字段
			
 
				+                elif k == "媒体引用" and isinstance(v, list):
			
 
				+                    # 遍历媒体引用数组
			
 
				+                    for idx, media_item in enumerate(v):
			
 
				+                        if isinstance(media_item, dict) and media_item.get("type") in ("image", "video", "audio"):
			
 
				+                            # 记录content字段作为媒体路径
			
 
				+                            content = media_item.get("content")
			
 
				+                            type_nm = media_item.get("type")
			
 
				+                            if content:
			
 
				+                                # 生成正确格式的路径，如"图片元素[5].媒体引用[0].content"
			
 
				+                                media_ref_path = f"{type_nm}-{new_path}[{idx}].content"
			
 
				+                                media_dict[media_ref_path] = content
			
 
				+                
			
 
				+                # 继续递归遍历
			
 
				+                traverse(v, new_path)
			
 
				+        
			
 
				+        elif isinstance(obj, list):
			
 
				+            for idx, item in enumerate(obj):
			
 
				+                # 对于数组元素，使用方括号索引
			
 
				+                new_path = f"{current_path}[{idx}]"
			
 
				+                traverse(item, new_path)
			
 
				+
			
 
				+    traverse(data)
			
 
				+    return {"what": what_dict, "media": media_dict}
			
 
				+
			
 
				+# 使用示例
			
 
				+if __name__ == "__main__":
			
 
				+    
			
 
				+    # 连接 MongoDB 数据库
			
 
				+    ##################### 存储到mongoDB
			
 
				+
			
 
				+    MONGO_URI = "mongodb://localhost:27017/"
			
 
				+    DB_NAME = "mydeconstruct"
			
 
				+    COLL_NAME = "deconstruct"
			
 
				+
			
 
				+    client = MongoClient(MONGO_URI)
			
 
				+    db = client[DB_NAME]
			
 
				+    coll = db[COLL_NAME]
			
 
				+
			
 
				+    # 读取并插入 JSON 文件
			
 
				+    json_path = "/home/ecs-user/project/colpali/src/deconstruct_res.json"
			
 
				+
			
 
				+    with open(json_path, "r", encoding="utf-8") as f:
			
 
				+        doc = json.load(f)
			
 
				+
			
 
				+    insert_result = coll.insert_one(doc)
			
 
				+    inserted_id = insert_result.inserted_id
			
 
				+    print("已插入 MongoDB，文档 _id：", inserted_id)
			
 
				+
			
 
				+    result = parse_deconstruct_res(doc)
			
 
				+    print("what 字段映射：", result["what"])
			
 
				+    print("媒体引用映射：", result["media"])
			
 
				+
			
 
				+    ##################### 存储到mongoDB
			
 
				+
			
 
				+    ##################### 将 result["what"] 中的每个 value 转换为向量并插入 Milvus
			
 
				+    ########## 文本向量库存一份what
			
 
				+    # 创建 Milvus 集合（如不存在）
			
 
				+    collection_name = "deconstruct_what"
			
 
				+    if not utility.has_collection(collection_name): 
			
 
				+        fields = [
			
 
				+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
			
 
				+            FieldSchema(name="mongo_id", dtype=DataType.VARCHAR, max_length=64),
			
 
				+            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512),
			
 
				+            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=2560)
			
 
				+        ]
			
 
				+        schema = CollectionSchema(fields, description="Deconstruct what embeddings")
			
 
				+        collection = Collection(name=collection_name, schema=schema)
			
 
				+        # 创建 IVF_FLAT 索引
			
 
				+        index_params = {
			
 
				+            "metric_type": "IP",
			
 
				+            "index_type": "IVF_FLAT",
			
 
				+            "params": {"nlist": 128}
			
 
				+        }
			
 
				+        collection.create_index("embedding", index_params)
			
 
				+    else:
			
 
				+        collection = Collection(name=collection_name)
			
 
				+
			
 
				+    # 遍历 result["what"]，生成 embeddings 并插入 Milvus
			
 
				+    entities = []
			
 
				+    for key, value in result["what"].items():
			
 
				+        embedding = get_basic_embedding(value, model=DEFAULT_MODEL)
			
 
				+        path = key
			
 
				+        entities.append({
			
 
				+            "mongo_id": str(inserted_id),
			
 
				+            "path": path,
			
 
				+            "embedding": embedding
			
 
				+        })
			
 
				+
			
 
				+    if entities:
			
 
				+        collection.insert(entities)
			
 
				+        collection.flush()
			
 
				+        print(f"已插入 {len(entities)} 条 what 字段向量到 Milvus")
			
 
				+    else:
			
 
				+        print("未找到 what 字段，未插入向量")
			
 
				+    ##################### 将 result["what"] 中的每个 value 转换为向量并插入 Milvus
			
 
				+
			
 
				+    #####################将 result["media"] 中的每个 value 调用多模态编码模型计算embedding并插入Milvus
			
 
				+    # 创建 Milvus 集合（如不存在）
			
 
				+    collection_name = "deconstruct_media"
			
 
				+    if not utility.has_collection(collection_name):
			
 
				+        fields = [
			
 
				+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
			
 
				+            FieldSchema(name="mongo_id", dtype=DataType.VARCHAR, max_length=64),
			
 
				+            FieldSchema(name="type", dtype=DataType.VARCHAR, max_length=64),
			
 
				+            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512),
			
 
				+            FieldSchema(name="no", dtype=DataType.INT32),
			
 
				+            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=2560)
			
 
				+        ]
			
 
				+        schema = CollectionSchema(fields, description="Deconstruct media embeddings")
			
 
				+        collection = Collection(name=collection_name, schema=schema)
			
 
				+        # 创建 IVF_FLAT 索引
			
 
				+        index_params = {
			
 
				+            "metric_type": "IP",
			
 
				+            "index_type": "IVF_FLAT",
			
 
				+            "params": {"nlist": 128}
			
 
				+        }
			
 
				+        collection.create_index("embedding", index_params)
			
 
				+    else:
			
 
				+        collection = Collection(name=collection_name)
			
 
				+        # 遍历 result["media"]，生成 embeddings 并插入 Milvus
			
 
				+    #############存储一份media embedding到Milvus
			
 
				+    entities = []
			
 
				+    for key, value in result["media"].items():
			
 
				+        embedding = get_media_embedding(value, model=DEFAULT_MODEL)
			
 
				+        type = key[:key.index("-")]
			
 
				+        path = key[key.index("-"):]
			
 
				+        # 将 embedding 列表拆分为单条向量，并记录其在原列表中的位置 no
			
 
				+        if isinstance(embedding, list) and len(embedding) > 0:
			
 
				+            for idx, vec in enumerate(embedding):
			
 
				+                entities.append({
			
 
				+                    "mongo_id": str(inserted_id),
			
 
				+                    "type": type,
			
 
				+                    "path": path,
			
 
				+                    "no": idx,
			
 
				+                    "embedding": vec
			
 
				+                })
			
 
				+        else:
			
 
				+            # 若 embedding 不是列表或长度为 0，则 no 记为 0
			
 
				+            entities.append({
			
 
				+                "mongo_id": str(inserted_id),
			
 
				+                "type": type,
			
 
				+                "path": path,
			
 
				+                "no": 0,
			
 
				+                "embedding": embedding
			
 
				+            })
			
 
				+
			
 
				+    # 将插入操作移到循环外部，避免重复插入和数据累积
			
 
				+    if entities:
			
 
				+        collection.insert(entities)
			
 
				+        collection.flush()
			
 
				+        print(f"已插入 {len(entities)} 条 media 字段向量到 Milvus")
			
 
				+    else:
			
 
				+        print("未找到有效的 media 字段向量，未插入数据")
			
 
				+
			
 
				+    #############存储一份what 多模态embedding 到Milvus
			
 
				+    entities = []
			
 
				+    for key, value in result["what"].items():
			
 
				+        embedding = get_media_embedding(value, model=DEFAULT_MODEL)
			
 
				+        # type = key[:key.index("-")]
			
 
				+        # path = key[key.index("-"):]
			
 
				+        path = key
			
 
				+        if isinstance(embedding, list) and len(embedding) > 0:
			
 
				+            for idx, vec in enumerate(embedding):
			
 
				+                entities.append({
			
 
				+                    "mongo_id": str(inserted_id),
			
 
				+                    "type": "text",
			
 
				+                    "path": path,
			
 
				+                    "no": idx,
			
 
				+                    "embedding": vec
			
 
				+                })
			
 
				+        else:
			
 
				+            # 若 embedding 不是列表或长度为 0，则 no 记为 0
			
 
				+            entities.append({
			
 
				+                "mongo_id": str(inserted_id),
			
 
				+                "type": "text",
			
 
				+                "path": path,
			
 
				+                "no": 0,
			
 
				+                "embedding": embedding
			
 
				+            })
			
 
				+
			
 
				+    # 将插入操作移到循环外部，避免重复插入和数据累积
			
 
				+    if entities:
			
 
				+        collection.insert(entities)
			
 
				+        collection.flush()
			
 
				+        print(f"已插入 {len(entities)} 条 what 多模态向量到 Milvus")
			
 
				+    else:
			
 
				+        print("未找到有效的 what 多模态向量，未插入数据")
			
 
				+    #############存储一份what 多模态embedding 到Milvus
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
--- a/deconstruct_SQI/milvus_deconstruct_query.py
+++ b/deconstruct_SQI/milvus_deconstruct_query.py
@@ -0,0 +1,114 @@
 
				+from ast import Import
			
 
				+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
			
 
				+import requests
			
 
				+import json
			
 
				+from typing import Dict, Any, List
			
 
				+from pymongo import MongoClient
			
 
				+from bson import ObjectId
			
 
				+from pydub import AudioSegment
			
 
				+import io
			
 
				+from scipy.io import wavfile
			
 
				+
			
 
				+################################连接milvus数据库 A
			
 
				+# 配置信息
			
 
				+MILVUS_CONFIG = {
			
 
				+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
			
 
				+    "user": "root",
			
 
				+    "password": "Piaoquan@2025",
			
 
				+    "port": "19530",
			
 
				+}
			
 
				+print("正在连接 Milvus 数据库...")
			
 
				+connections.connect("default", **MILVUS_CONFIG)
			
 
				+print("连接成功！")
			
 
				+################################连接milvus数据库 B
			
 
				+
			
 
				+##################### mongoDB
			
 
				+MONGO_URI = "mongodb://localhost:27017/"
			
 
				+DB_NAME = "mydeconstruct"
			
 
				+COLL_NAME = "deconstruct"
			
 
				+
			
 
				+client = MongoClient(MONGO_URI)
			
 
				+db = client[DB_NAME]
			
 
				+coll = db[COLL_NAME]
			
 
				+##################### mongoDB
			
 
				+
			
 
				+##################### 路径解析返回
			
 
				+def resolve_mongo_path(mongo_id: str, path: str):
			
 
				+    """
			
 
				+    根据 mongo_id 与形如 '文本元素[1].子节点元素[0].what' 的路径字符串，
			
 
				+    从 MongoDB 中定位并返回对应的对象。
			
 
				+    """
			
 
				+    doc = coll.find_one({"_id": ObjectId(mongo_id)})
			
 
				+    if not doc:
			
 
				+        return None
			
 
				+
			
 
				+    # 将路径按 '.' 分割，逐级访问
			
 
				+    parts = path.split('.')
			
 
				+    current = doc
			
 
				+    for part in parts:
			
 
				+        # 处理数组索引，如 子节点元素[0]
			
 
				+        if '[' in part and part.endswith(']'):
			
 
				+            key, idx_str = part.split('[', 1)
			
 
				+            idx = int(idx_str[:-1])  # 去掉 ']'
			
 
				+            current = current[key][idx]
			
 
				+        else:
			
 
				+            current = current[part]
			
 
				+    return current
			
 
				+##################### 路径解析返回
			
 
				+
			
 
				+search_mode ="what_search"
			
 
				+
			
 
				+if search_mode == "what_search":
			
 
				+    ##################query what
			
 
				+    ##################
			
 
				+    milvus_client = Collection(name="deconstruct_what")
			
 
				+    milvus_client.load()
			
 
				+    collection_name = "deconstruct_what"
			
 
				+    if not utility.has_collection(collection_name):
			
 
				+        print(f"no collection named {collection_name}")
			
 
				+    else:
			
 
				+        # 查询并打印 collection 中的所有记录
			
 
				+        print(f"正在查询 collection '{collection_name}' 中的所有记录...")
			
 
				+        try:
			
 
				+            # 使用 query 方法获取所有记录，不设置过滤条件
			
 
				+            all_records = milvus_client.query(
			
 
				+                expr="mongo_id >\"10000000\"",  # 空表达式表示查询所有
			
 
				+                output_fields=["mongo_id","path"],  # 输出所有字段
			
 
				+                limit=10000  # 设置一个较大的上限，确保能获取全部
			
 
				+            )
			
 
				+            print(f"共查询到 {len(all_records)} 条记录：")
			
 
				+            for record in all_records:
			
 
				+                print(record)
			
 
				+                rec = resolve_mongo_path(record["mongo_id"], record["path"])
			
 
				+                print("定位items：",rec)
			
 
				+                # docres = coll.find_one({"_id": ObjectId(record["mongo_id"])})
			
 
				+                # print(docres)
			
 
				+        except Exception as e:
			
 
				+            print(f"查询失败：{e}")
			
 
				+    ##############all_records返回存储的每个record， rec返回解析后的对象
			
 
				+elif search_mode == "media_search":
			
 
				+    ##################query media
			
 
				+    ##################
			
 
				+    milvus_client = Collection(name="deconstruct_media")
			
 
				+    milvus_client.load()
			
 
				+    collection_name = "deconstruct_media"
			
 
				+    if not utility.has_collection(collection_name):
			
 
				+        print(f"no collection named {collection_name}")
			
 
				+    else:
			
 
				+        # 查询并打印 collection 中的所有记录
			
 
				+        print(f"正在查询 collection '{collection_name}' 中的所有记录...")
			
 
				+        try:
			
 
				+            # 使用 query 方法获取所有记录，不设置过滤条件
			
 
				+            all_records = milvus_client.query( 
			
 
				+                expr="type==\"text\"",  # 空表达式表示查询所有
			
 
				+                output_fields=["mongo_id","path","type"],  # 输出所有字段
			
 
				+                limit=100  # 设置一个较大的上限，确保能获取全部
			
 
				+            )
			
 
				+            print(f"共查询到 {len(all_records)} 条记录：")
			
 
				+            for record in all_records:
			
 
				+                print(record)
			
 
				+                docres = coll.find_one({"_id": ObjectId(record["mongo_id"])})
			
 
				+                print(docres)
			
 
				+        except Exception as e:
			
 
				+            print(f"查询失败：{e}")
			
 
				+
			
--- a/deconstruct_SQI/milvus_deconstruct_search.py
+++ b/deconstruct_SQI/milvus_deconstruct_search.py
@@ -0,0 +1,243 @@
 
				+from ast import Import
			
 
				+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
			
 
				+import requests
			
 
				+import json
			
 
				+from typing import Dict, Any, List
			
 
				+from pymongo import MongoClient
			
 
				+from bson import ObjectId
			
 
				+from pydub import AudioSegment
			
 
				+import io, os
			
 
				+from scipy.io import wavfile
			
 
				+import numpy
			
 
				+
			
 
				+################################连接milvus数据库 A
			
 
				+# 配置信息
			
 
				+MILVUS_CONFIG = {
			
 
				+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
			
 
				+    "user": "root",
			
 
				+    "password": "Piaoquan@2025",
			
 
				+    "port": "19530",
			
 
				+}
			
 
				+print("正在连接 Milvus 数据库...")
			
 
				+connections.connect("default", **MILVUS_CONFIG)
			
 
				+print("连接成功！")
			
 
				+################################连接milvus数据库 B
			
 
				+
			
 
				+##################################引入多模态向量模型#################
			
 
				+import torch
			
 
				+from PIL import Image
			
 
				+from transformers.utils.import_utils import is_flash_attn_2_available
			
 
				+from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
			
 
				+
			
 
				+model = ColQwen2_5Omni.from_pretrained(
			
 
				+    "vidore/colqwen-omni-v0.1",
			
 
				+    torch_dtype=torch.bfloat16,
			
 
				+    device_map="cuda",  # or "mps" if on Apple Silicon
			
 
				+    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
			
 
				+).eval()
			
 
				+processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
			
 
				+##################################引入多模态向量模型#################
			
 
				+
			
 
				+##################### mongoDB
			
 
				+MONGO_URI = "mongodb://localhost:27017/"
			
 
				+DB_NAME = "mydeconstruct"
			
 
				+COLL_NAME = "deconstruct"
			
 
				+
			
 
				+client = MongoClient(MONGO_URI)
			
 
				+db = client[DB_NAME]
			
 
				+coll = db[COLL_NAME]
			
 
				+##################### mongoDB
			
 
				+
			
 
				+#####################text embedding serviceS
			
 
				+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
			
 
				+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
			
 
				+
			
 
				+def get_basic_embedding(text: str, model=DEFAULT_MODEL):
			
 
				+    """通过HTTP调用在线embedding服务"""
			
 
				+    headers = {
			
 
				+        "Content-Type": "application/json"
			
 
				+    }
			
 
				+    data = {
			
 
				+        "model": model,
			
 
				+        "input": text
			
 
				+    } 
			
 
				+    response = requests.post(
			
 
				+        VLLM_SERVER_URL,
			
 
				+        headers=headers,
			
 
				+        json=data,
			
 
				+        timeout=5  # 添加超时设置
			
 
				+    )
			
 
				+    response.raise_for_status()  # 如果状态码不是200，抛出异常
			
 
				+    result = response.json()
			
 
				+    return result["data"][0]["embedding"]
			
 
				+#####################text embedding serviceS
			
 
				+
			
 
				+#####################multi vector search
			
 
				+import numpy as np
			
 
				+from collections import defaultdict
			
 
				+from typing import List, Dict, Tuple
			
 
				+
			
 
				+###############multi vector search
			
 
				+def search_topk_multi(
			
 
				+    collection: Collection,
			
 
				+    query_vecs: List[List[float]],  # 查询向量列表 [vec1, vec2, ...]
			
 
				+    topk: int = 2
			
 
				+) -> List[Tuple[str, float]]:
			
 
				+    """
			
 
				+    对查询向量列表检索，计算每个对象的平均最大相似度，返回 TopK 对象
			
 
				+    
			
 
				+    参数：
			
 
				+        collection: Milvus 集合实例
			
 
				+        query_vecs: 查询向量列表（每个向量维度需与集合一致）
			
 
				+        topk: 返回的 top 数量
			
 
				+    
			
 
				+    返回：
			
 
				+        排序后的列表，元素为 (object_id, 平均最大相似度)
			
 
				+    """
			
 
				+    # 步骤1：逐个检索查询向量，收集每个对象的最大相似度
			
 
				+    all_query_results = []  # 存储每个查询的 {object_id: 最大相似度}
			
 
				+    for q_idx, q_vec in enumerate[List[float]](query_vecs):
			
 
				+        # 检索当前查询向量
			
 
				+        search_params = {
			
 
				+            "metric_type": "IP",
			
 
				+            "params": {"nprobe": 10}
			
 
				+        }
			
 
				+        results = collection.search(
			
 
				+            data=[q_vec],
			
 
				+            anns_field="embedding",
			
 
				+            param=search_params,
			
 
				+            limit=16384,  # Milvus最大允许的topk值
			
 
				+            output_fields=["mongo_id", "type", "path"],
			
 
				+            expr='type == "image"'  # 只检索type为text的记录
			
 
				+        )
			
 
				+        # 按 object_id 分组取最大相似度
			
 
				+        query_object_sim = defaultdict(float)
			
 
				+        for hit in results[0]:
			
 
				+            obj_id = hit.entity.get("mongo_id")
			
 
				+            sim = hit.score
			
 
				+            if sim > query_object_sim[obj_id]:
			
 
				+                query_object_sim[obj_id] = sim
			
 
				+        
			
 
				+        all_query_results.append(query_object_sim)
			
 
				+        print(f"查询向量 {q_idx+1}/{len(query_vecs)} 处理完成，覆盖 {len(query_object_sim)} 个对象")
			
 
				+    
			
 
				+    # 步骤2：计算每个对象的平均最大相似度
			
 
				+    all_object_ids = set()
			
 
				+    for res in all_query_results:
			
 
				+        all_object_ids.update(res.keys())  # 收集所有出现过的对象
			
 
				+    
			
 
				+    object_avg_sim = {}
			
 
				+    for obj_id in all_object_ids:
			
 
				+        sims = [res.get(obj_id, 0.0) for res in all_query_results]  # 未匹配的查询按0处理
			
 
				+        avg_sim = sum(sims) / len(query_vecs)  # 计算平均值
			
 
				+        object_avg_sim[obj_id] = avg_sim
			
 
				+    
			
 
				+    # 步骤3：按平均相似度排序并取 TopK
			
 
				+    sorted_objects = sorted(
			
 
				+        object_avg_sim.items(),
			
 
				+        key=lambda x: x[1],
			
 
				+        reverse=True
			
 
				+    )[:topk]
			
 
				+    return sorted_objects
			
 
				+##################文本数据库 search
			
 
				+
			
 
				+###############single vector search
			
 
				+def search_topk_single(
			
 
				+    collection: Collection,
			
 
				+    query_vec: List[float],  # 查询向量
			
 
				+    topk: int = 2
			
 
				+) -> List[dict]:
			
 
				+    """
			
 
				+    对单个查询向量检索，计算每个对象的最大相似度，返回 TopK 对象
			
 
				+    
			
 
				+    参数：
			
 
				+        collection: Milvus 集合实例
			
 
				+        query_vec: 查询向量（维度需与集合一致）
			
 
				+        topk: 返回的 top 数量
			
 
				+    
			
 
				+    返回：
			
 
				+        排序后的列表，元素为 (object_id, 最大相似度)
			
 
				+    """
			
 
				+    # 步骤1：检索当前查询向量
			
 
				+    search_params = {
			
 
				+        "metric_type": "IP",
			
 
				+        "params": {"nprobe": 10}
			
 
				+    }
			
 
				+    results = collection.search(
			
 
				+        data=[query_vec],
			
 
				+        anns_field="embedding",
			
 
				+        param=search_params,
			
 
				+        limit=topk,  # Milvus最大允许的topk值
			
 
				+        output_fields=["mongo_id", "path"],
			
 
				+    )
			
 
				+    return results[0]
			
 
				+###############single vector search
			
 
				+
			
 
				+
			
 
				+search_mode = "what_search"
			
 
				+
			
 
				+if search_mode =="what_search":
			
 
				+    ##################添加what_search
			
 
				+    ########模拟计算出的embedding
			
 
				+
			
 
				+    query = '#假如食物会说话'
			
 
				+
			
 
				+    # queries_embeddings =  get_basic_embedding(text = query )
			
 
				+
			
 
				+    #########暂时代替
			
 
				+    import numpy as np
			
 
				+    q_vec = list(np.random.randn(2560))
			
 
				+    #########暂时代替
			
 
				+
			
 
				+    milvus_client = Collection(name="deconstruct_what")
			
 
				+    milvus_client.load()
			
 
				+    collection_name = "deconstruct_what"
			
 
				+
			
 
				+    search_params = {
			
 
				+        "metric_type": "IP",
			
 
				+        "params": {"nprobe": 10}
			
 
				+    }
			
 
				+
			
 
				+    results = search_topk_single(milvus_client,q_vec,topk= 3)
			
 
				+    print("results is ", results)
			
 
				+
			
 
				+    for i,record in enumerate(results):
			
 
				+        #########暂时代替############
			
 
				+        if record['mongo_id'] =='10000000':
			
 
				+            mongo_id = '68f894176a7850acc4851b27'
			
 
				+        else:
			
 
				+            mongo_id = record['mongo_id']
			
 
				+        #########暂时代替############
			
 
				+        docres = coll.find_one({"_id": ObjectId(mongo_id)})
			
 
				+        print(f"第{i+1}个结果*********************：{docres}\n")
			
 
				+
			
 
				+elif search_mode =="media_search":
			
 
				+    ##################多模态search
			
 
				+    ##################
			
 
				+    queries = os.path.join("../src", "dragon_mother.jpeg")
			
 
				+    query_image = Image.open(queries)
			
 
				+    # Process the inputs
			
 
				+    batch_queries = processor.process_images([query_image]).to(model.device)
			
 
				+    # Forward pass
			
 
				+    with torch.no_grad():
			
 
				+        query_embeddings = model(**batch_queries)
			
 
				+
			
 
				+    ##################添加media_search
			
 
				+    milvus_client = Collection(name="deconstruct_media")
			
 
				+    milvus_client.load()
			
 
				+    collection_name = "deconstruct_media"
			
 
				+
			
 
				+    query_embeddings = query_embeddings.cpu().to(dtype=torch.float32).numpy().tolist()
			
 
				+    query_embeddings=query_embeddings[0]
			
 
				+    scores = search_topk_multi(milvus_client,query_embeddings,topk= 3)
			
 
				+
			
 
				+    print("search_topk_multi结果：",scores)
			
 
				+    ####输出结果
			
 
				+    for i,record in enumerate(scores):
			
 
				+        docres = coll.find_one({"_id": ObjectId(record[0])})
			
 
				+        print(f"第{i+1}个结果*********************：{docres}\n")
			
 
				+    
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
--- a/deconstruct_SQI/milvus_how_insert.py
+++ b/deconstruct_SQI/milvus_how_insert.py
@@ -0,0 +1,194 @@
 
				+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
			
 
				+import requests
			
 
				+import json
			
 
				+from typing import Dict, Any, List
			
 
				+from pymongo import MongoClient
			
 
				+
			
 
				+from pydub import AudioSegment
			
 
				+import io
			
 
				+from scipy.io import wavfile
			
 
				+import numpy as np
			
 
				+################################连接milvus数据库 A
			
 
				+# 配置信息
			
 
				+MILVUS_CONFIG = {
			
 
				+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
			
 
				+    "user": "root",
			
 
				+    "password": "Piaoquan@2025",
			
 
				+    "port": "19530",
			
 
				+}
			
 
				+print("正在连接 Milvus 数据库...")
			
 
				+connections.connect("default", **MILVUS_CONFIG)
			
 
				+print("连接成功！")
			
 
				+################################连接milvus数据库 B
			
 
				+
			
 
				+################################连接Embedding service A
			
 
				+# 注意：根据之前的讨论，需要通过SSH隧道将远程服务转发到本地
			
 
				+# 在本地机器上执行: ssh -R 8000:192.168.100.31:8000 username@server_ip
			
 
				+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
			
 
				+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
			
 
				+
			
 
				+def get_basic_embedding(text: str, model=DEFAULT_MODEL):
			
 
				+    """通过HTTP调用在线embedding服务"""
			
 
				+    headers = {
			
 
				+        "Content-Type": "application/json"
			
 
				+    }
			
 
				+    data = {
			
 
				+        "model": model,
			
 
				+        "input": text
			
 
				+    }
			
 
				+    
			
 
				+    response = requests.post(
			
 
				+        VLLM_SERVER_URL,
			
 
				+        headers=headers,
			
 
				+        json=data,
			
 
				+        timeout=5  # 添加超时设置
			
 
				+    )
			
 
				+    response.raise_for_status()  # 如果状态码不是200，抛出异常
			
 
				+    result = response.json()
			
 
				+    return result["data"][0]["embedding"]
			
 
				+
			
 
				+def parse_how_res(json_data) -> Dict[str, Dict[str, str]]:
			
 
				+    """
			
 
				+    解析 how_res.json 文件，提取两类信息：
			
 
				+    1. 所有 "how","why" 字段的 path 与 value 映射
			
 
				+    
			
 
				+    返回:
			
 
				+        {
			
 
				+            "how": {path: value, ...},
			
 
				+            "why": {path: value, ...}
			
 
				+        }
			
 
				+    """
			
 
				+    data = json_data
			
 
				+    how_dict: Dict[str, Any] = {}
			
 
				+    why_dict: Dict[str, Any] = {}
			
 
				+
			
 
				+    def traverse(obj: Any, current_path: str = ""):
			
 
				+        """递归遍历 JSON 结构，记录目标字段"""
			
 
				+        if isinstance(obj, dict):
			
 
				+            for k, v in obj.items():
			
 
				+                # 构建新路径，避免在开头添加点号
			
 
				+                new_path = f"{current_path}.{k}" if current_path else k
			
 
				+                
			
 
				+                if k == "how":
			
 
				+                    how_dict[new_path] = v
			
 
				+                elif k == "why":
			
 
				+                    why_dict[new_path] = v
			
 
				+                
			
 
				+                # 继续递归遍历
			
 
				+                traverse(v, new_path)
			
 
				+        
			
 
				+        elif isinstance(obj, list):
			
 
				+            for idx, item in enumerate(obj):
			
 
				+                # 对于数组元素，使用方括号索引
			
 
				+                new_path = f"{current_path}[{idx}]"
			
 
				+                traverse(item, new_path)
			
 
				+
			
 
				+    traverse(data)
			
 
				+    return {"how": how_dict, "why": why_dict}
			
 
				+
			
 
				+# 使用示例
			
 
				+if __name__ == "__main__":
			
 
				+    
			
 
				+    # 连接 MongoDB 数据库
			
 
				+    ##################### 存储到mongoDB
			
 
				+
			
 
				+    MONGO_URI = "mongodb://localhost:27017/"
			
 
				+    DB_NAME = "mydeconstruct"
			
 
				+    COLL_NAME = "deconstruct_how"
			
 
				+
			
 
				+    client = MongoClient(MONGO_URI)
			
 
				+    db = client[DB_NAME]
			
 
				+    coll = db[COLL_NAME]
			
 
				+
			
 
				+    # 读取并插入 JSON 文件
			
 
				+    json_path = "/home/ecs-user/project/colpali/src/how_res.json"
			
 
				+
			
 
				+    with open(json_path, "r", encoding="utf-8") as f:
			
 
				+        doc = json.load(f)
			
 
				+
			
 
				+    result = parse_how_res(doc)
			
 
				+    # print("how 字段映射：", result["how"])
			
 
				+    # print("why 字段映射：", result["why"])
			
 
				+
			
 
				+    for key, value in result["how"].items():
			
 
				+        print(f"how 字段 {key} 的值为: {value}")
			
 
				+
			
 
				+    for key, value in result["why"].items():
			
 
				+        print(f"why 字段 {key} 的值为: {value}")
			
 
				+
			
 
				+    insert_result = coll.insert_one(doc)
			
 
				+    inserted_id = insert_result.inserted_id
			
 
				+
			
 
				+    ##################### 将 result["how"] 中的每个 value 转换为向量并插入 Milvus
			
 
				+    ########## 文本向量库存一份how
			
 
				+    # 创建 Milvus 集合（如不存在）
			
 
				+    collection_name = "deconstruct_how"
			
 
				+    if not utility.has_collection(collection_name): 
			
 
				+        fields = [
			
 
				+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
			
 
				+            FieldSchema(name="mongo_id", dtype=DataType.VARCHAR, max_length=64),
			
 
				+            FieldSchema(name="type", dtype=DataType.VARCHAR, max_length=64),
			
 
				+            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512),
			
 
				+            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=2560)
			
 
				+        ]
			
 
				+        schema = CollectionSchema(fields, description="Deconstruct how embeddings")
			
 
				+        collection = Collection(name=collection_name, schema=schema)
			
 
				+        # 创建 IVF_FLAT 索引
			
 
				+        index_params = {
			
 
				+            "metric_type": "IP",
			
 
				+            "index_type": "IVF_FLAT",
			
 
				+            "params": {"nlist": 128}
			
 
				+        }
			
 
				+        collection.create_index("embedding", index_params)
			
 
				+    else:
			
 
				+        collection = Collection(name=collection_name)
			
 
				+
			
 
				+    entities = []
			
 
				+    for key, value in result["how"].items():
			
 
				+
			
 
				+        ### 访问可达则替换
			
 
				+        # embedding = get_basic_embedding(value, model=DEFAULT_MODEL)
			
 
				+        ###
			
 
				+        embedding = np.random.rand(2560).tolist()
			
 
				+
			
 
				+        path = key
			
 
				+        entities.append({
			
 
				+            "mongo_id": str(inserted_id),
			
 
				+            "type": "how", 
			
 
				+            "path": path,
			
 
				+            "embedding": embedding
			
 
				+        })
			
 
				+
			
 
				+    # 遍历 result["why"]，生成 embeddings 并插入 Milvus
			
 
				+    if entities:
			
 
				+        collection.insert(entities)
			
 
				+        collection.flush()
			
 
				+        print(f"已插入 {len(entities)} 条 how 字段向量到 Milvus")
			
 
				+    else:
			
 
				+        print("未找到 how 字段，未插入向量")
			
 
				+
			
 
				+    entities = []
			
 
				+    for key, value in result["why"].items():
			
 
				+
			
 
				+        # embedding = get_basic_embedding(value, model=DEFAULT_MODEL)
			
 
				+        
			
 
				+        embedding = np.random.rand(2560).tolist()
			
 
				+
			
 
				+        path = key
			
 
				+        entities.append({
			
 
				+            "mongo_id": str(inserted_id),
			
 
				+            "type": "why", 
			
 
				+            "path": path,
			
 
				+            "embedding": embedding
			
 
				+        })
			
 
				+
			
 
				+    if entities:
			
 
				+        collection.insert(entities)
			
 
				+        collection.flush()
			
 
				+        print(f"已插入 {len(entities)} 条 why 字段向量到 Milvus")
			
 
				+    else:
			
 
				+        print("未找到 why 字段，未插入向量")
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
--- a/deconstruct_SQI/milvus_how_query.py
+++ b/deconstruct_SQI/milvus_how_query.py
@@ -0,0 +1,116 @@
 
				+from ast import Import
			
 
				+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
			
 
				+import requests
			
 
				+import json
			
 
				+from typing import Dict, Any, List
			
 
				+from pymongo import MongoClient
			
 
				+from bson import ObjectId
			
 
				+from pydub import AudioSegment
			
 
				+import io
			
 
				+from scipy.io import wavfile
			
 
				+
			
 
				+################################连接milvus数据库 A
			
 
				+# 配置信息
			
 
				+MILVUS_CONFIG = {
			
 
				+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
			
 
				+    "user": "root",
			
 
				+    "password": "Piaoquan@2025",
			
 
				+    "port": "19530",
			
 
				+}
			
 
				+print("正在连接 Milvus 数据库...")
			
 
				+connections.connect("default", **MILVUS_CONFIG)
			
 
				+print("连接成功！")
			
 
				+################################连接milvus数据库 B
			
 
				+
			
 
				+##################### mongoDB
			
 
				+MONGO_URI = "mongodb://localhost:27017/"
			
 
				+DB_NAME = "mydeconstruct"
			
 
				+COLL_NAME = "deconstruct_how"
			
 
				+
			
 
				+client = MongoClient(MONGO_URI)
			
 
				+db = client[DB_NAME]
			
 
				+coll = db[COLL_NAME]
			
 
				+##################### mongoDB
			
 
				+
			
 
				+##################### 路径解析返回
			
 
				+def resolve_mongo_path(mongo_id: str, path: str):
			
 
				+    """
			
 
				+    根据 mongo_id 与形如 '文本元素[1].子节点元素[0].what' 的路径字符串，
			
 
				+    从 MongoDB 中定位并返回对应的对象。
			
 
				+    """
			
 
				+    doc = coll.find_one({"_id": ObjectId(mongo_id)})
			
 
				+    if not doc:
			
 
				+        return None
			
 
				+
			
 
				+    # 将路径按 '.' 分割，逐级访问
			
 
				+    parts = path.split('.')
			
 
				+    current = doc
			
 
				+    for part in parts:
			
 
				+        # 处理数组索引，如 子节点元素[0]
			
 
				+        if '[' in part and part.endswith(']'):
			
 
				+            key, idx_str = part.split('[', 1)
			
 
				+            idx = int(idx_str[:-1])  # 去掉 ']'
			
 
				+            current = current[key][idx]
			
 
				+        else:
			
 
				+            current = current[part]
			
 
				+    return current
			
 
				+##################### 路径解析返回
			
 
				+
			
 
				+search_mode ="why_search" # "why_search"
			
 
				+
			
 
				+if search_mode == "how_search":
			
 
				+    ##################query what
			
 
				+    ##################
			
 
				+    milvus_client = Collection(name="deconstruct_how")
			
 
				+    milvus_client.load()
			
 
				+    collection_name = "deconstruct_how"
			
 
				+    if not utility.has_collection(collection_name):
			
 
				+        print(f"no collection named {collection_name}")
			
 
				+    else:
			
 
				+        # 查询并打印 collection 中的所有记录
			
 
				+        print(f"正在查询 collection '{collection_name}' 中的所有记录...")
			
 
				+        try:
			
 
				+            # 使用 query 方法获取所有记录，不设置过滤条件
			
 
				+            all_records = milvus_client.query(
			
 
				+                expr="mongo_id >\"10000000\" and type == \"how\"",  # 空表达式表示查询所有
			
 
				+                output_fields=["mongo_id","type","path"],  # 输出所有字段
			
 
				+                limit=10  # 设置一个较大的上限，确保能获取全部
			
 
				+            )
			
 
				+            print(f"共查询到 {len(all_records)} 条记录：")
			
 
				+            for record in all_records:
			
 
				+                print(record)
			
 
				+                rec = resolve_mongo_path(record["mongo_id"], record["path"])
			
 
				+                print("定位items：",rec)
			
 
				+                # docres = coll.find_one({"_id": ObjectId(record["mongo_id"])})
			
 
				+                # print(docres)
			
 
				+        except Exception as e:
			
 
				+            print(f"查询失败：{e}")
			
 
				+    ##############all_records返回存储的每个record， rec返回解析后的对象
			
 
				+elif search_mode == "why_search":
			
 
				+    ##################query why
			
 
				+    ##################
			
 
				+    milvus_client = Collection(name="deconstruct_how")
			
 
				+    milvus_client.load()
			
 
				+    collection_name = "deconstruct_how"
			
 
				+    if not utility.has_collection(collection_name):
			
 
				+        print(f"no collection named {collection_name}")
			
 
				+    else:
			
 
				+        # 查询并打印 collection 中的所有记录
			
 
				+        print(f"正在查询 collection '{collection_name}' 中的所有记录...")
			
 
				+        try:
			
 
				+            # 使用 query 方法获取所有记录，不设置过滤条件
			
 
				+            all_records = milvus_client.query(
			
 
				+                expr="mongo_id >\"10000000\" and type == \"why\"",  # 空表达式表示查询所有
			
 
				+                output_fields=["mongo_id","type","path"],  # 输出所有字段
			
 
				+                limit=10  # 设置一个较大的上限，确保能获取全部
			
 
				+            )
			
 
				+            print(f"共查询到 {len(all_records)} 条记录：")
			
 
				+            for record in all_records:
			
 
				+                print(record)
			
 
				+                rec = resolve_mongo_path(record["mongo_id"], record["path"])
			
 
				+                print("定位items：",rec)
			
 
				+                # docres = coll.find_one({"_id": ObjectId(record["mongo_id"])})
			
 
				+                # print(docres)
			
 
				+        except Exception as e:
			
 
				+            print(f"查询失败：{e}")
			
 
				+    ##############all_records返回存储的每个record， rec返回解析后的对象
			
--- a/deconstruct_SQI/milvus_how_search.py
+++ b/deconstruct_SQI/milvus_how_search.py
@@ -0,0 +1,241 @@
 
				+from ast import Import
			
 
				+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
			
 
				+import requests
			
 
				+import json
			
 
				+from typing import Dict, Any, List
			
 
				+from pymongo import MongoClient
			
 
				+from bson import ObjectId
			
 
				+from pydub import AudioSegment
			
 
				+import io, os
			
 
				+from scipy.io import wavfile
			
 
				+import numpy
			
 
				+
			
 
				+################################连接milvus数据库 A
			
 
				+# 配置信息
			
 
				+MILVUS_CONFIG = {
			
 
				+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
			
 
				+    "user": "root",
			
 
				+    "password": "Piaoquan@2025",
			
 
				+    "port": "19530",
			
 
				+}
			
 
				+print("正在连接 Milvus 数据库...")
			
 
				+connections.connect("default", **MILVUS_CONFIG)
			
 
				+print("连接成功！")
			
 
				+################################连接milvus数据库 B
			
 
				+
			
 
				+##################################引入多模态向量模型#################
			
 
				+import torch
			
 
				+from PIL import Image
			
 
				+from transformers.utils.import_utils import is_flash_attn_2_available
			
 
				+from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
			
 
				+
			
 
				+model = ColQwen2_5Omni.from_pretrained(
			
 
				+    "vidore/colqwen-omni-v0.1",
			
 
				+    torch_dtype=torch.bfloat16,
			
 
				+    device_map="cuda",  # or "mps" if on Apple Silicon
			
 
				+    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
			
 
				+).eval()
			
 
				+processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
			
 
				+##################################引入多模态向量模型#################
			
 
				+
			
 
				+##################### mongoDB
			
 
				+MONGO_URI = "mongodb://localhost:27017/"
			
 
				+DB_NAME = "mydeconstruct"
			
 
				+COLL_NAME = "deconstruct_how"
			
 
				+
			
 
				+client = MongoClient(MONGO_URI)
			
 
				+db = client[DB_NAME]
			
 
				+coll = db[COLL_NAME]
			
 
				+##################### mongoDB
			
 
				+
			
 
				+#####################text embedding serviceS
			
 
				+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
			
 
				+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
			
 
				+
			
 
				+def get_basic_embedding(text: str, model=DEFAULT_MODEL):
			
 
				+    """通过HTTP调用在线embedding服务"""
			
 
				+    headers = {
			
 
				+        "Content-Type": "application/json"
			
 
				+    }
			
 
				+    data = {
			
 
				+        "model": model,
			
 
				+        "input": text
			
 
				+    } 
			
 
				+    response = requests.post(
			
 
				+        VLLM_SERVER_URL,
			
 
				+        headers=headers,
			
 
				+        json=data,
			
 
				+        timeout=5  # 添加超时设置
			
 
				+    )
			
 
				+    response.raise_for_status()  # 如果状态码不是200，抛出异常
			
 
				+    result = response.json()
			
 
				+    return result["data"][0]["embedding"]
			
 
				+#####################text embedding serviceS
			
 
				+#####################multi vector search
			
 
				+import numpy as np
			
 
				+from collections import defaultdict
			
 
				+from typing import List, Dict, Tuple
			
 
				+
			
 
				+###############multi vector search
			
 
				+def search_topk_multi(
			
 
				+    collection: Collection,
			
 
				+    query_vecs: List[List[float]],  # 查询向量列表 [vec1, vec2, ...]
			
 
				+    topk: int = 2
			
 
				+) -> List[Tuple[str, float]]:
			
 
				+    """
			
 
				+    对查询向量列表检索，计算每个对象的平均最大相似度，返回 TopK 对象
			
 
				+    
			
 
				+    参数：
			
 
				+        collection: Milvus 集合实例
			
 
				+        query_vecs: 查询向量列表（每个向量维度需与集合一致）
			
 
				+        topk: 返回的 top 数量
			
 
				+    
			
 
				+    返回：
			
 
				+        排序后的列表，元素为 (object_id, 平均最大相似度)
			
 
				+    """
			
 
				+    # 步骤1：逐个检索查询向量，收集每个对象的最大相似度
			
 
				+    all_query_results = []  # 存储每个查询的 {object_id: 最大相似度}
			
 
				+    for q_idx, q_vec in enumerate[List[float]](query_vecs):
			
 
				+        # 检索当前查询向量
			
 
				+        search_params = {
			
 
				+            "metric_type": "IP",
			
 
				+            "params": {"nprobe": 10}
			
 
				+        }
			
 
				+        results = collection.search(
			
 
				+            data=[q_vec],
			
 
				+            anns_field="embedding",
			
 
				+            param=search_params,
			
 
				+            limit=16384,  # Milvus最大允许的topk值
			
 
				+            output_fields=["mongo_id", "type", "path"],
			
 
				+            expr='type == "image"'  # 只检索type为text的记录
			
 
				+        )
			
 
				+        # 按 object_id 分组取最大相似度
			
 
				+        query_object_sim = defaultdict(float)
			
 
				+        for hit in results[0]:
			
 
				+            obj_id = hit.entity.get("mongo_id")
			
 
				+            sim = hit.score
			
 
				+            if sim > query_object_sim[obj_id]:
			
 
				+                query_object_sim[obj_id] = sim
			
 
				+        
			
 
				+        all_query_results.append(query_object_sim)
			
 
				+        print(f"查询向量 {q_idx+1}/{len(query_vecs)} 处理完成，覆盖 {len(query_object_sim)} 个对象")
			
 
				+    
			
 
				+    # 步骤2：计算每个对象的平均最大相似度
			
 
				+    all_object_ids = set()
			
 
				+    for res in all_query_results:
			
 
				+        all_object_ids.update(res.keys())  # 收集所有出现过的对象
			
 
				+    
			
 
				+    object_avg_sim = {}
			
 
				+    for obj_id in all_object_ids:
			
 
				+        sims = [res.get(obj_id, 0.0) for res in all_query_results]  # 未匹配的查询按0处理
			
 
				+        avg_sim = sum(sims) / len(query_vecs)  # 计算平均值
			
 
				+        object_avg_sim[obj_id] = avg_sim
			
 
				+    
			
 
				+    # 步骤3：按平均相似度排序并取 TopK
			
 
				+    sorted_objects = sorted(
			
 
				+        object_avg_sim.items(),
			
 
				+        key=lambda x: x[1],
			
 
				+        reverse=True
			
 
				+    )[:topk]
			
 
				+    return sorted_objects
			
 
				+##################文本数据库 search
			
 
				+###############single vector search
			
 
				+def search_topk_single(
			
 
				+    collection: Collection,
			
 
				+    query_vec: List[float],  # 查询向量
			
 
				+    topk: int = 2,
			
 
				+    expr='type=="why"',
			
 
				+) -> List[dict]:
			
 
				+    """
			
 
				+    对单个查询向量检索，计算每个对象的最大相似度，返回 TopK 对象
			
 
				+    
			
 
				+    参数：
			
 
				+        collection: Milvus 集合实例
			
 
				+        query_vec: 查询向量（维度需与集合一致）
			
 
				+        topk: 返回的 top 数量
			
 
				+    
			
 
				+    返回：
			
 
				+        排序后的列表，元素为 (object_id, 最大相似度)
			
 
				+    """
			
 
				+    # 步骤1：检索当前查询向量
			
 
				+    search_params = {
			
 
				+        "metric_type": "IP",
			
 
				+        "params": {"nprobe": 10}
			
 
				+    }
			
 
				+    results = collection.search(
			
 
				+        data=[query_vec],
			
 
				+        anns_field="embedding",
			
 
				+        param=search_params,
			
 
				+        limit=topk,  # Milvus最大允许的topk值
			
 
				+        output_fields=["mongo_id","type","path"],
			
 
				+        expr= expr
			
 
				+    )
			
 
				+    return results[0]
			
 
				+###############single vector search
			
 
				+
			
 
				+search_mode = "how_search" #"why_search"
			
 
				+
			
 
				+if search_mode =="how_search":
			
 
				+    ##################添加what_search
			
 
				+    ########模拟计算出的embedding
			
 
				+
			
 
				+    query = '#假如食物会说话'
			
 
				+    
			
 
				+    # queries_embeddings =  get_basic_embedding(text = query )
			
 
				+    #########暂时代替
			
 
				+    import numpy as np
			
 
				+    q_vec = list(np.random.randn(2560))
			
 
				+    #########暂时代替
			
 
				+
			
 
				+    milvus_client = Collection(name="deconstruct_how")
			
 
				+    milvus_client.load()
			
 
				+    collection_name = "deconstruct_how"
			
 
				+
			
 
				+    search_params = {
			
 
				+        "metric_type": "IP",
			
 
				+        "params": {"nprobe": 10}
			
 
				+    }
			
 
				+
			
 
				+    results = search_topk_single(milvus_client,q_vec,topk= 3,expr = 'type=="how"')
			
 
				+    print("results is ", results)
			
 
				+    for i,record in enumerate(results):
			
 
				+        #########暂时代替############
			
 
				+        if record['mongo_id'] =='10000000':
			
 
				+            mongo_id = '68f894176a7850acc4851b27'
			
 
				+        else:
			
 
				+            mongo_id = record['mongo_id']
			
 
				+        #########暂时代替############
			
 
				+        docres = coll.find_one({"_id": ObjectId(mongo_id)})
			
 
				+        # print(f"第{i+1}个结果*********************：{docres}\n")
			
 
				+
			
 
				+elif search_mode =="why_search":
			
 
				+    ##################添加what_search
			
 
				+    ########模拟计算出的embedding
			
 
				+    query = '#假如食物会说话'
			
 
				+    # queries_embeddings =  get_basic_embedding(text = query )
			
 
				+    #########暂时代替
			
 
				+    import numpy as np
			
 
				+    q_vec = list(np.random.randn(2560))
			
 
				+    #########暂时代替
			
 
				+
			
 
				+    milvus_client = Collection(name="deconstruct_how")
			
 
				+    milvus_client.load()
			
 
				+    collection_name = "deconstruct_how"
			
 
				+
			
 
				+    search_params = {
			
 
				+        "metric_type": "IP",
			
 
				+        "params": {"nprobe": 10}
			
 
				+    }
			
 
				+
			
 
				+    results = search_topk_single(milvus_client,q_vec,topk= 3,expr = 'type=="why"')
			
 
				+    print("results is ", results)
			
 
				+
			
 
				+    for i,record in enumerate(results):
			
 
				+        #########暂时代替############
			
 
				+        if record['mongo_id'] =='10000000':
			
 
				+            mongo_id = '68f894176a7850acc4851b27'
			
 
				+        else:
			
 
				+            mongo_id = record['mongo_id']
			
 
				+        #########暂时代替############
			
 
				+        docres = coll.find_one({"_id": ObjectId(mongo_id)})
			
 
				+        print(f"第{i+1}个结果*********************：{docres}\n")
			
--- a/deconstruct_SQI/milvus_pattern_insert.py
+++ b/deconstruct_SQI/milvus_pattern_insert.py
@@ -0,0 +1,184 @@
 
				+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
			
 
				+import requests
			
 
				+import json
			
 
				+from typing import Dict, Any, List
			
 
				+from pymongo import MongoClient
			
 
				+
			
 
				+from pydub import AudioSegment
			
 
				+import io
			
 
				+from scipy.io import wavfile
			
 
				+import numpy as np
			
 
				+################################连接milvus数据库 A
			
 
				+# 配置信息
			
 
				+MILVUS_CONFIG = {
			
 
				+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
			
 
				+    "user": "root",
			
 
				+    "password": "Piaoquan@2025",
			
 
				+    "port": "19530",
			
 
				+}
			
 
				+print("正在连接 Milvus 数据库...")
			
 
				+connections.connect("default", **MILVUS_CONFIG)
			
 
				+print("连接成功！")
			
 
				+################################连接milvus数据库 B
			
 
				+
			
 
				+################################连接Embedding service A
			
 
				+# 注意：根据之前的讨论，需要通过SSH隧道将远程服务转发到本地
			
 
				+# 在本地机器上执行: ssh -R 8000:192.168.100.31:8000 username@server_ip
			
 
				+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
			
 
				+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
			
 
				+
			
 
				+def get_basic_embedding(text: str, model=DEFAULT_MODEL):
			
 
				+    """通过HTTP调用在线embedding服务"""
			
 
				+    headers = {
			
 
				+        "Content-Type": "application/json"
			
 
				+    }
			
 
				+    data = {
			
 
				+        "model": model,
			
 
				+        "input": text
			
 
				+    }
			
 
				+    
			
 
				+    response = requests.post(
			
 
				+        VLLM_SERVER_URL,
			
 
				+        headers=headers,
			
 
				+        json=data,
			
 
				+        timeout=5  # 添加超时设置
			
 
				+    )
			
 
				+    response.raise_for_status()  # 如果状态码不是200，抛出异常
			
 
				+    result = response.json()
			
 
				+    return result["data"][0]["embedding"]
			
 
				+
			
 
				+def parse_pattern_res(json_data) -> Dict[str, Dict[str, str]]:
			
 
				+    """
			
 
				+    解析 pattern_res.json 文件，提取两类信息：
			
 
				+    1. 所有 "模式ID","模式命名","模式说明" 字段的 path 与 value 映射
			
 
				+    
			
 
				+    返回:
			
 
				+        {
			
 
				+            "模式ID": {path: value, ...},
			
 
				+            "模式命名": {path: value, ...},
			
 
				+            "模式说明": {path: value, ...}
			
 
				+        }
			
 
				+    """
			
 
				+    data = json_data
			
 
				+    pattern_dict: Dict[str, Any] = {}
			
 
				+    def traverse(obj: Any, current_path: str = ""):
			
 
				+        """递归遍历 JSON 结构，记录目标字段"""
			
 
				+        if isinstance(obj, dict):
			
 
				+            for k, v in obj.items():
			
 
				+                # 构建新路径，避免在开头添加点号
			
 
				+                new_path = f"{current_path}.{k}" if current_path else k
			
 
				+                if k == "模式ID":
			
 
				+                    # 当遇到“模式ID”时，同时获取同层的“模式命名”和“模式描述”
			
 
				+                    temp_dict ={}
			
 
				+                    temp_dict["模式ID"] = v
			
 
				+                    temp_dict["模式命名"] = obj.get("模式命名", "")
			
 
				+                    temp_dict["模式说明"] = obj.get("模式说明", "")
			
 
				+
			
 
				+                    pattern_dict[current_path] = temp_dict
			
 
				+
			
 
				+                traverse(v, new_path)
			
 
				+        
			
 
				+        elif isinstance(obj, list):
			
 
				+            for idx, item in enumerate(obj):
			
 
				+                # 对于数组元素，使用方括号索引
			
 
				+                new_path = f"{current_path}[{idx}]"
			
 
				+                traverse(item, new_path)
			
 
				+    traverse(data)
			
 
				+    return {"pattern": pattern_dict}
			
 
				+
			
 
				+# 使用示例
			
 
				+if __name__ == "__main__":
			
 
				+    # 连接 MongoDB 数据库
			
 
				+    ##################### 存储到mongoDB
			
 
				+    MONGO_URI = "mongodb://localhost:27017/"
			
 
				+    DB_NAME = "mydeconstruct"
			
 
				+    COLL_NAME = "deconstruct_how"
			
 
				+
			
 
				+    client = MongoClient(MONGO_URI)
			
 
				+    db = client[DB_NAME]
			
 
				+    coll = db[COLL_NAME]
			
 
				+
			
 
				+    # 读取并插入 JSON 文件
			
 
				+    json_path = "/home/ecs-user/project/colpali/src/pattern_res.json"
			
 
				+
			
 
				+    with open(json_path, "r", encoding="utf-8") as f:
			
 
				+        doc = json.load(f)
			
 
				+
			
 
				+    result = parse_pattern_res(doc)
			
 
				+
			
 
				+    for key, value in result["pattern"].items():
			
 
				+        print(f"pattern 字段 {key} 的值为: {value}")
			
 
				+
			
 
				+    # exit()
			
 
				+    insert_result = coll.insert_one(doc)
			
 
				+    inserted_id = insert_result.inserted_id
			
 
				+
			
 
				+    ##################### 将 result["how"] 中的每个 value 转换为向量并插入 Milvus
			
 
				+    ########## 文本向量库存一份how
			
 
				+    # 创建 Milvus 集合（如不存在）
			
 
				+    collection_name = "deconstruct_pattern"
			
 
				+    if not utility.has_collection(collection_name): 
			
 
				+        # utility.drop_collection(collection_name)
			
 
				+        fields = [
			
 
				+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
			
 
				+            FieldSchema(name="mongo_id", dtype=DataType.VARCHAR, max_length=64),
			
 
				+            FieldSchema(name="pattern_id", dtype=DataType.VARCHAR, max_length=64),
			
 
				+            FieldSchema(name="pattern_name", dtype=DataType.VARCHAR, max_length=128),
			
 
				+            FieldSchema(name="pattern_desc", dtype=DataType.VARCHAR, max_length=2048),
			
 
				+            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512),
			
 
				+            FieldSchema(name="name_embedding", dtype=DataType.FLOAT_VECTOR, dim=2560),
			
 
				+            FieldSchema(name="desc_embedding", dtype=DataType.FLOAT_VECTOR, dim=2560)
			
 
				+        ]
			
 
				+        schema = CollectionSchema(fields, description="Deconstruct how embeddings")
			
 
				+        collection = Collection(name=collection_name, schema=schema)
			
 
				+        # 创建 IVF_FLAT 索引
			
 
				+        index_params = {
			
 
				+            "metric_type": "IP",
			
 
				+            "index_type": "IVF_FLAT",
			
 
				+            "params": {"nlist": 128}
			
 
				+        }
			
 
				+
			
 
				+        # 为 pattern_id 字段创建字符串索引
			
 
				+        collection.create_index("pattern_id", {
			
 
				+            "index_type": "INVERTED" #"Trie"
			
 
				+        })
			
 
				+        collection.create_index("name_embedding", index_params)
			
 
				+        collection.create_index("desc_embedding", index_params)
			
 
				+    else:
			
 
				+        collection = Collection(name=collection_name)
			
 
				+
			
 
				+    entities = []
			
 
				+    for key, value in result["pattern"].items():
			
 
				+        pattern_id = value["模式ID"]
			
 
				+        pattern_name = value["模式命名"]
			
 
				+        pattern_desc = value["模式说明"]
			
 
				+
			
 
				+        ### 访问可达则替换
			
 
				+        # name_embedding = get_basic_embedding(pattern_name, model=DEFAULT_MODEL)
			
 
				+        # desc_embedding = get_basic_embedding(pattern_desc, model=DEFAULT_MODEL)
			
 
				+        ###
			
 
				+        name_embedding = np.random.rand(2560).tolist()
			
 
				+        desc_embedding = np.random.rand(2560).tolist()
			
 
				+
			
 
				+        path = key
			
 
				+        entities.append({
			
 
				+            "mongo_id": str(inserted_id),
			
 
				+            "pattern_id": pattern_id, 
			
 
				+            "pattern_name": pattern_name,
			
 
				+            "pattern_desc": pattern_desc,
			
 
				+            "path": path,
			
 
				+            "name_embedding": name_embedding,
			
 
				+            "desc_embedding": desc_embedding
			
 
				+        })
			
 
				+    # 遍历 result["pattern"]，生成 embeddings 并插入 Milvus
			
 
				+    # print("entities is ", entities)
			
 
				+    if entities:
			
 
				+        collection.insert(entities)
			
 
				+        collection.flush()
			
 
				+        print(f"已插入 {len(entities)} 条 how 字段向量到 Milvus")
			
 
				+    else:
			
 
				+        print("未找到 how 字段，未插入向量")
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
--- a/deconstruct_SQI/milvus_pattern_query.py
+++ b/deconstruct_SQI/milvus_pattern_query.py
@@ -0,0 +1,87 @@
 
				+from ast import Import
			
 
				+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
			
 
				+import requests
			
 
				+import json
			
 
				+from typing import Dict, Any, List
			
 
				+from pymongo import MongoClient
			
 
				+from bson import ObjectId
			
 
				+from pydub import AudioSegment
			
 
				+import io
			
 
				+from scipy.io import wavfile
			
 
				+
			
 
				+################################连接milvus数据库 A
			
 
				+# 配置信息
			
 
				+MILVUS_CONFIG = {
			
 
				+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
			
 
				+    "user": "root",
			
 
				+    "password": "Piaoquan@2025",
			
 
				+    "port": "19530",
			
 
				+}
			
 
				+print("正在连接 Milvus 数据库...")
			
 
				+connections.connect("default", **MILVUS_CONFIG)
			
 
				+print("连接成功！")
			
 
				+################################连接milvus数据库 B
			
 
				+
			
 
				+##################### mongoDB
			
 
				+MONGO_URI = "mongodb://localhost:27017/"
			
 
				+DB_NAME = "mydeconstruct"
			
 
				+COLL_NAME = "deconstruct_how"
			
 
				+
			
 
				+client = MongoClient(MONGO_URI)
			
 
				+db = client[DB_NAME]
			
 
				+coll = db[COLL_NAME]
			
 
				+##################### mongoDB
			
 
				+
			
 
				+##################### 路径解析返回
			
 
				+def resolve_mongo_path(mongo_id: str, path: str):
			
 
				+    """
			
 
				+    根据 mongo_id 与形如 '文本元素[1].子节点元素[0].what' 的路径字符串，
			
 
				+    从 MongoDB 中定位并返回对应的对象。
			
 
				+    """
			
 
				+    doc = coll.find_one({"_id": ObjectId(mongo_id)})
			
 
				+    if not doc:
			
 
				+        return None
			
 
				+
			
 
				+    # 将路径按 '.' 分割，逐级访问
			
 
				+    parts = path.split('.')
			
 
				+    current = doc
			
 
				+    for part in parts:
			
 
				+        # 处理数组索引，如 子节点元素[0]
			
 
				+        if '[' in part and part.endswith(']'):
			
 
				+            key, idx_str = part.split('[', 1)
			
 
				+            idx = int(idx_str[:-1])  # 去掉 ']'
			
 
				+            current = current[key][idx]
			
 
				+        else:
			
 
				+            current = current[part]
			
 
				+    return current
			
 
				+##################### 路径解析返回
			
 
				+
			
 
				+search_mode ="id_search"  # "name_search" ,"desc_search", "id_search"
			
 
				+
			
 
				+if search_mode == "id_search":
			
 
				+    ##################query what
			
 
				+    ##################
			
 
				+    milvus_client = Collection(name="deconstruct_pattern")
			
 
				+    milvus_client.load()
			
 
				+    collection_name = "deconstruct_pattern"
			
 
				+    if not utility.has_collection(collection_name):
			
 
				+        print(f"no collection named {collection_name}")
			
 
				+    else:
			
 
				+        # 查询并打印 collection 中的所有记录
			
 
				+        print(f"正在查询 collection '{collection_name}' 中的所有记录...")
			
 
				+        try:
			
 
				+            # 使用 query 方法获取所有记录，不设置过滤条件
			
 
				+            all_records = milvus_client.query(
			
 
				+                expr="pattern_id =='pattern_制作-图集-形容词_2' ",  # 空表达式表示查询所有
			
 
				+                output_fields=["mongo_id","pattern_id","pattern_name","pattern_desc","path"],  # 输出所有字段
			
 
				+                limit=10  # 设置一个较大的上限，确保能获取全部
			
 
				+            )
			
 
				+            print(f"共查询到 {len(all_records)} 条记录：")
			
 
				+            for record in all_records:
			
 
				+                print(record)
			
 
				+                rec = resolve_mongo_path(record["mongo_id"], record["path"])
			
 
				+                print("定位items：",rec)
			
 
				+                # docres = coll.find_one({"_id": ObjectId(record["mongo_id"])})
			
 
				+                # print(docres)
			
 
				+        except Exception as e:
			
 
				+            print(f"查询失败：{e}")
			
--- a/deconstruct_SQI/milvus_pattern_search.py
+++ b/deconstruct_SQI/milvus_pattern_search.py
@@ -0,0 +1,207 @@
 
				+from ast import Import
			
 
				+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
			
 
				+import requests
			
 
				+import json
			
 
				+from typing import Dict, Any, List
			
 
				+from pymongo import MongoClient
			
 
				+from bson import ObjectId
			
 
				+from pydub import AudioSegment
			
 
				+import io, os
			
 
				+from scipy.io import wavfile
			
 
				+import numpy
			
 
				+
			
 
				+################################连接milvus数据库 A
			
 
				+# 配置信息
			
 
				+MILVUS_CONFIG = {
			
 
				+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
			
 
				+    "user": "root",
			
 
				+    "password": "Piaoquan@2025",
			
 
				+    "port": "19530",
			
 
				+}
			
 
				+print("正在连接 Milvus 数据库...")
			
 
				+connections.connect("default", **MILVUS_CONFIG)
			
 
				+print("连接成功！")
			
 
				+################################连接milvus数据库 B
			
 
				+
			
 
				+# ##################################引入多模态向量模型#################
			
 
				+# import torch
			
 
				+# from PIL import Image
			
 
				+# from transformers.utils.import_utils import is_flash_attn_2_available
			
 
				+# from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
			
 
				+
			
 
				+# model = ColQwen2_5Omni.from_pretrained(
			
 
				+#     "vidore/colqwen-omni-v0.1",
			
 
				+#     torch_dtype=torch.bfloat16,
			
 
				+#     device_map="cuda",  # or "mps" if on Apple Silicon
			
 
				+#     attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
			
 
				+# ).eval()
			
 
				+# processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
			
 
				+# ##################################引入多模态向量模型#################
			
 
				+
			
 
				+##################### mongoDB
			
 
				+MONGO_URI = "mongodb://localhost:27017/"
			
 
				+DB_NAME = "mydeconstruct"
			
 
				+COLL_NAME = "deconstruct_how"
			
 
				+
			
 
				+client = MongoClient(MONGO_URI)
			
 
				+db = client[DB_NAME]
			
 
				+coll = db[COLL_NAME]
			
 
				+##################### mongoDB
			
 
				+
			
 
				+#####################text embedding serviceS
			
 
				+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
			
 
				+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
			
 
				+
			
 
				+def get_basic_embedding(text: str, model=DEFAULT_MODEL):
			
 
				+    """通过HTTP调用在线embedding服务"""
			
 
				+    headers = {
			
 
				+        "Content-Type": "application/json"
			
 
				+    }
			
 
				+    data = {
			
 
				+        "model": model,
			
 
				+        "input": text
			
 
				+    } 
			
 
				+    response = requests.post(
			
 
				+        VLLM_SERVER_URL,
			
 
				+        headers=headers,
			
 
				+        json=data,
			
 
				+        timeout=5  # 添加超时设置
			
 
				+    )
			
 
				+    response.raise_for_status()  # 如果状态码不是200，抛出异常
			
 
				+    result = response.json()
			
 
				+    return result["data"][0]["embedding"]
			
 
				+#####################text embedding serviceS
			
 
				+#####################multi vector search
			
 
				+import numpy as np
			
 
				+from collections import defaultdict
			
 
				+from typing import List, Dict, Tuple
			
 
				+
			
 
				+###############multi vector search
			
 
				+def search_topk_multi(
			
 
				+    collection: Collection,
			
 
				+    query_vecs: List[List[float]],  # 查询向量列表 [vec1, vec2, ...]
			
 
				+    topk: int = 2
			
 
				+) -> List[Tuple[str, float]]:
			
 
				+    """
			
 
				+    对查询向量列表检索，计算每个对象的平均最大相似度，返回 TopK 对象
			
 
				+    
			
 
				+    参数：
			
 
				+        collection: Milvus 集合实例
			
 
				+        query_vecs: 查询向量列表（每个向量维度需与集合一致）
			
 
				+        topk: 返回的 top 数量
			
 
				+    
			
 
				+    返回：
			
 
				+        排序后的列表，元素为 (object_id, 平均最大相似度)
			
 
				+    """
			
 
				+    # 步骤1：逐个检索查询向量，收集每个对象的最大相似度
			
 
				+    all_query_results = []  # 存储每个查询的 {object_id: 最大相似度}
			
 
				+    for q_idx, q_vec in enumerate[List[float]](query_vecs):
			
 
				+        # 检索当前查询向量
			
 
				+        search_params = {
			
 
				+            "metric_type": "IP",
			
 
				+            "params": {"nprobe": 10}
			
 
				+        }
			
 
				+        results = collection.search(
			
 
				+            data=[q_vec],
			
 
				+            anns_field="embedding",
			
 
				+            param=search_params,
			
 
				+            limit=16384,  # Milvus最大允许的topk值
			
 
				+            output_fields=["mongo_id", "type", "path"],
			
 
				+            expr='type == "image"'  # 只检索type为text的记录
			
 
				+        )
			
 
				+        # 按 object_id 分组取最大相似度
			
 
				+        query_object_sim = defaultdict(float)
			
 
				+        for hit in results[0]:
			
 
				+            obj_id = hit.entity.get("mongo_id")
			
 
				+            sim = hit.score
			
 
				+            if sim > query_object_sim[obj_id]:
			
 
				+                query_object_sim[obj_id] = sim
			
 
				+        
			
 
				+        all_query_results.append(query_object_sim)
			
 
				+        print(f"查询向量 {q_idx+1}/{len(query_vecs)} 处理完成，覆盖 {len(query_object_sim)} 个对象")
			
 
				+    
			
 
				+    # 步骤2：计算每个对象的平均最大相似度
			
 
				+    all_object_ids = set()
			
 
				+    for res in all_query_results:
			
 
				+        all_object_ids.update(res.keys())  # 收集所有出现过的对象
			
 
				+    
			
 
				+    object_avg_sim = {}
			
 
				+    for obj_id in all_object_ids:
			
 
				+        sims = [res.get(obj_id, 0.0) for res in all_query_results]  # 未匹配的查询按0处理
			
 
				+        avg_sim = sum(sims) / len(query_vecs)  # 计算平均值
			
 
				+        object_avg_sim[obj_id] = avg_sim
			
 
				+    
			
 
				+    # 步骤3：按平均相似度排序并取 TopK
			
 
				+    sorted_objects = sorted(
			
 
				+        object_avg_sim.items(),
			
 
				+        key=lambda x: x[1],
			
 
				+        reverse=True
			
 
				+    )[:topk]
			
 
				+    return sorted_objects
			
 
				+##################文本数据库 search
			
 
				+###############single vector search
			
 
				+def search_topk_single(
			
 
				+    collection: Collection,
			
 
				+    query_vec: List[float],  # 查询向量
			
 
				+    topk: int = 2
			
 
				+) -> List[dict]:
			
 
				+    """
			
 
				+    对单个查询向量检索，计算每个对象的最大相似度，返回 TopK 对象
			
 
				+    
			
 
				+    参数：
			
 
				+        collection: Milvus 集合实例
			
 
				+        query_vec: 查询向量（维度需与集合一致）
			
 
				+        topk: 返回的 top 数量
			
 
				+    
			
 
				+    返回：
			
 
				+        排序后的列表，元素为 (object_id, 最大相似度)
			
 
				+    """
			
 
				+    # 步骤1：检索当前查询向量
			
 
				+    search_params = {
			
 
				+        "metric_type": "IP",
			
 
				+        "params": {"nprobe": 10}
			
 
				+    }
			
 
				+    results = collection.search(
			
 
				+        data=[query_vec],
			
 
				+        anns_field="name_embedding", #'desc_embedding'
			
 
				+        param=search_params,
			
 
				+        limit=topk,  # Milvus最大允许的topk值
			
 
				+        output_fields=["mongo_id","pattern_id","pattern_name","pattern_desc","path"]
			
 
				+    )
			
 
				+    return results[0]
			
 
				+###############single vector search
			
 
				+
			
 
				+search_mode = "name_search" #"desc_search"
			
 
				+
			
 
				+if search_mode =="name_search":
			
 
				+    ##################添加what_search
			
 
				+    ########模拟计算出的embedding
			
 
				+
			
 
				+    query = '#假如食物会说话'
			
 
				+    
			
 
				+    # queries_embeddings =  get_basic_embedding(text = query )
			
 
				+    #########暂时代替
			
 
				+    import numpy as np
			
 
				+    q_vec = list(np.random.randn(2560))
			
 
				+    #########暂时代替
			
 
				+
			
 
				+    milvus_client = Collection(name="deconstruct_pattern")
			
 
				+    milvus_client.load()
			
 
				+    collection_name = "deconstruct_pattern"
			
 
				+
			
 
				+    search_params = {
			
 
				+        "metric_type": "IP",
			
 
				+        "params": {"nprobe": 10}
			
 
				+    }
			
 
				+
			
 
				+    results = search_topk_single(milvus_client,q_vec,topk= 3)
			
 
				+    print("results is ", results)
			
 
				+    for i,record in enumerate(results):
			
 
				+        #########暂时代替############
			
 
				+        if record['mongo_id'] =='10000000':
			
 
				+            mongo_id = '68f894176a7850acc4851b27'
			
 
				+        else:
			
 
				+            mongo_id = record['mongo_id']
			
 
				+        #########暂时代替############
			
 
				+        docres = coll.find_one({"_id": ObjectId(mongo_id)})
			
 
				+        # print(f"第{i+1}个结果*********************：{docres}\n")