пре 1 месец · e836747eee
--- a/deconstruct_SQI/milvus_deconstruct_insert.py
+++ b/deconstruct_SQI/milvus_deconstruct_insert.py
@@ -0,0 +1,312 @@
 
															+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
														
 
															+import requests
														
 
															+import json
														
 
															+from typing import Dict, Any, List
														
 
															+from pymongo import MongoClient
														
 
															+
														
 
															+from pydub import AudioSegment
														
 
															+import io
														
 
															+from scipy.io import wavfile
														
 
															+
														
 
															+################################连接milvus数据库 A
														
 
															+# 配置信息
														
 
															+MILVUS_CONFIG = {
														
 
															+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
														
 
															+    "user": "root",
														
 
															+    "password": "Piaoquan@2025",
														
 
															+    "port": "19530",
														
 
															+}
														
 
															+print("正在连接 Milvus 数据库...")
														
 
															+connections.connect("default", **MILVUS_CONFIG)
														
 
															+print("连接成功！")
														
 
															+################################连接milvus数据库 B
														
 
															+
														
 
															+##################################引入多模态模型#################
														
 
															+import torch
														
 
															+from PIL import Image
														
 
															+from transformers.utils.import_utils import is_flash_attn_2_available
														
 
															+
														
 
															+from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
														
 
															+
														
 
															+model = ColQwen2_5Omni.from_pretrained(
														
 
															+    "vidore/colqwen-omni-v0.1",
														
 
															+    torch_dtype=torch.bfloat16,
														
 
															+    device_map="cuda",  # or "mps" if on Apple Silicon
														
 
															+    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
														
 
															+).eval()
														
 
															+processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
														
 
															+##################################引入模型#################
														
 
															+
														
 
															+################################连接Embedding service A
														
 
															+# 注意：根据之前的讨论，需要通过SSH隧道将远程服务转发到本地
														
 
															+# 在本地机器上执行: ssh -R 8000:192.168.100.31:8000 username@server_ip
														
 
															+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
														
 
															+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
														
 
															+
														
 
															+def get_basic_embedding(text: str, model=DEFAULT_MODEL):
														
 
															+    """通过HTTP调用在线embedding服务"""
														
 
															+    headers = {
														
 
															+        "Content-Type": "application/json"
														
 
															+    }
														
 
															+    data = {
														
 
															+        "model": model,
														
 
															+        "input": text
														
 
															+    }
														
 
															+    
														
 
															+    response = requests.post(
														
 
															+        VLLM_SERVER_URL,
														
 
															+        headers=headers,
														
 
															+        json=data,
														
 
															+        timeout=5  # 添加超时设置
														
 
															+    )
														
 
															+    response.raise_for_status()  # 如果状态码不是200，抛出异常
														
 
															+    result = response.json()
														
 
															+    return result["data"][0]["embedding"]
														
 
															+
														
 
															+def get_media_embedding(query: str, type: str):
														
 
															+    '''
														
 
															+    query 是查询字符串或文件路径
														
 
															+    type 是查询类型，可选值为 "audio", "image", "video", "text"
														
 
															+    k 是返回的结果数量，默认值为 3
														
 
															+    audio image video 的query为路径
														
 
															+    text的query为问题本身
														
 
															+    '''
														
 
															+    if type =="audio":
														
 
															+        batch_queries = processor.process_audios([query]).to(model.device)
														
 
															+
														
 
															+    elif type =="image":
														
 
															+        query_image = Image.open(query)
														
 
															+        batch_queries = processor.process_images([query_image]).to(model.device)
														
 
															+    elif type =="video":
														
 
															+        batch_queries = processor.process_videos([query]).to(model.device)   
														
 
															+    elif type =="text":
														
 
															+        batch_queries = processor.process_queries([query]).to(model.device)
														
 
															+    # Forward pass
														
 
															+    with torch.no_grad():
														
 
															+        query_embeddings = model(**batch_queries)
														
 
															+    return query_embeddings
														
 
															+    # # scores = processor.score_multi_vector(query_embeddings, ds)
														
 
															+    # print("score is ", scores)
														
 
															+    # # get top-5 scores
														
 
															+    # return scores[0].topk(k).indices.tolist()
														
 
															+
														
 
															+# ################################连接Embedding service B
														
 
															+
														
 
															+def parse_deconstruct_res(json_data) -> Dict[str, Dict[str, str]]:
														
 
															+    """
														
 
															+    解析 deconstruct_res.json 文件，提取两类信息：
														
 
															+    1. 所有 "what" 字段的 path 与 value 映射
														
 
															+    2. 所有类型为 "image" 或 "video" 的媒体引用 path 与 content 值映射
														
 
															+    
														
 
															+    返回:
														
 
															+        {
														
 
															+            "what": {path: value, ...},
														
 
															+            "media": {path: value, ...}
														
 
															+        }
														
 
															+    """
														
 
															+    data = json_data
														
 
															+    what_dict: Dict[str, Any] = {}
														
 
															+    media_dict: Dict[str, Any] = {}
														
 
															+
														
 
															+    def traverse(obj: Any, current_path: str = ""):
														
 
															+        """递归遍历 JSON 结构，记录目标字段"""
														
 
															+        if isinstance(obj, dict):
														
 
															+            for k, v in obj.items():
														
 
															+                # 构建新路径，避免在开头添加点号
														
 
															+                new_path = f"{current_path}.{k}" if current_path else k
														
 
															+                
														
 
															+                if k == "what":
														
 
															+                    what_dict[new_path] = v
														
 
															+                # 处理媒体引用字段
														
 
															+                elif k == "媒体引用" and isinstance(v, list):
														
 
															+                    # 遍历媒体引用数组
														
 
															+                    for idx, media_item in enumerate(v):
														
 
															+                        if isinstance(media_item, dict) and media_item.get("type") in ("image", "video", "audio"):
														
 
															+                            # 记录content字段作为媒体路径
														
 
															+                            content = media_item.get("content")
														
 
															+                            type_nm = media_item.get("type")
														
 
															+                            if content:
														
 
															+                                # 生成正确格式的路径，如"图片元素[5].媒体引用[0].content"
														
 
															+                                media_ref_path = f"{type_nm}-{new_path}[{idx}].content"
														
 
															+                                media_dict[media_ref_path] = content
														
 
															+                
														
 
															+                # 继续递归遍历
														
 
															+                traverse(v, new_path)
														
 
															+        
														
 
															+        elif isinstance(obj, list):
														
 
															+            for idx, item in enumerate(obj):
														
 
															+                # 对于数组元素，使用方括号索引
														
 
															+                new_path = f"{current_path}[{idx}]"
														
 
															+                traverse(item, new_path)
														
 
															+
														
 
															+    traverse(data)
														
 
															+    return {"what": what_dict, "media": media_dict}
														
 
															+
														
 
															+# 使用示例
														
 
															+if __name__ == "__main__":
														
 
															+    
														
 
															+    # 连接 MongoDB 数据库
														
 
															+    ##################### 存储到mongoDB
														
 
															+
														
 
															+    MONGO_URI = "mongodb://localhost:27017/"
														
 
															+    DB_NAME = "mydeconstruct"
														
 
															+    COLL_NAME = "deconstruct"
														
 
															+
														
 
															+    client = MongoClient(MONGO_URI)
														
 
															+    db = client[DB_NAME]
														
 
															+    coll = db[COLL_NAME]
														
 
															+
														
 
															+    # 读取并插入 JSON 文件
														
 
															+    json_path = "/home/ecs-user/project/colpali/src/deconstruct_res.json"
														
 
															+
														
 
															+    with open(json_path, "r", encoding="utf-8") as f:
														
 
															+        doc = json.load(f)
														
 
															+
														
 
															+    insert_result = coll.insert_one(doc)
														
 
															+    inserted_id = insert_result.inserted_id
														
 
															+    print("已插入 MongoDB，文档 _id：", inserted_id)
														
 
															+
														
 
															+    result = parse_deconstruct_res(doc)
														
 
															+    print("what 字段映射：", result["what"])
														
 
															+    print("媒体引用映射：", result["media"])
														
 
															+
														
 
															+    ##################### 存储到mongoDB
														
 
															+
														
 
															+    ##################### 将 result["what"] 中的每个 value 转换为向量并插入 Milvus
														
 
															+    ########## 文本向量库存一份what
														
 
															+    # 创建 Milvus 集合（如不存在）
														
 
															+    collection_name = "deconstruct_what"
														
 
															+    if not utility.has_collection(collection_name): 
														
 
															+        fields = [
														
 
															+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
														
 
															+            FieldSchema(name="mongo_id", dtype=DataType.VARCHAR, max_length=64),
														
 
															+            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512),
														
 
															+            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=2560)
														
 
															+        ]
														
 
															+        schema = CollectionSchema(fields, description="Deconstruct what embeddings")
														
 
															+        collection = Collection(name=collection_name, schema=schema)
														
 
															+        # 创建 IVF_FLAT 索引
														
 
															+        index_params = {
														
 
															+            "metric_type": "IP",
														
 
															+            "index_type": "IVF_FLAT",
														
 
															+            "params": {"nlist": 128}
														
 
															+        }
														
 
															+        collection.create_index("embedding", index_params)
														
 
															+    else:
														
 
															+        collection = Collection(name=collection_name)
														
 
															+
														
 
															+    # 遍历 result["what"]，生成 embeddings 并插入 Milvus
														
 
															+    entities = []
														
 
															+    for key, value in result["what"].items():
														
 
															+        embedding = get_basic_embedding(value, model=DEFAULT_MODEL)
														
 
															+        path = key
														
 
															+        entities.append({
														
 
															+            "mongo_id": str(inserted_id),
														
 
															+            "path": path,
														
 
															+            "embedding": embedding
														
 
															+        })
														
 
															+
														
 
															+    if entities:
														
 
															+        collection.insert(entities)
														
 
															+        collection.flush()
														
 
															+        print(f"已插入 {len(entities)} 条 what 字段向量到 Milvus")
														
 
															+    else:
														
 
															+        print("未找到 what 字段，未插入向量")
														
 
															+    ##################### 将 result["what"] 中的每个 value 转换为向量并插入 Milvus
														
 
															+
														
 
															+    #####################将 result["media"] 中的每个 value 调用多模态编码模型计算embedding并插入Milvus
														
 
															+    # 创建 Milvus 集合（如不存在）
														
 
															+    collection_name = "deconstruct_media"
														
 
															+    if not utility.has_collection(collection_name):
														
 
															+        fields = [
														
 
															+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
														
 
															+            FieldSchema(name="mongo_id", dtype=DataType.VARCHAR, max_length=64),
														
 
															+            FieldSchema(name="type", dtype=DataType.VARCHAR, max_length=64),
														
 
															+            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512),
														
 
															+            FieldSchema(name="no", dtype=DataType.INT32),
														
 
															+            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=2560)
														
 
															+        ]
														
 
															+        schema = CollectionSchema(fields, description="Deconstruct media embeddings")
														
 
															+        collection = Collection(name=collection_name, schema=schema)
														
 
															+        # 创建 IVF_FLAT 索引
														
 
															+        index_params = {
														
 
															+            "metric_type": "IP",
														
 
															+            "index_type": "IVF_FLAT",
														
 
															+            "params": {"nlist": 128}
														
 
															+        }
														
 
															+        collection.create_index("embedding", index_params)
														
 
															+    else:
														
 
															+        collection = Collection(name=collection_name)
														
 
															+        # 遍历 result["media"]，生成 embeddings 并插入 Milvus
														
 
															+    #############存储一份media embedding到Milvus
														
 
															+    entities = []
														
 
															+    for key, value in result["media"].items():
														
 
															+        embedding = get_media_embedding(value, model=DEFAULT_MODEL)
														
 
															+        type = key[:key.index("-")]
														
 
															+        path = key[key.index("-"):]
														
 
															+        # 将 embedding 列表拆分为单条向量，并记录其在原列表中的位置 no
														
 
															+        if isinstance(embedding, list) and len(embedding) > 0:
														
 
															+            for idx, vec in enumerate(embedding):
														
 
															+                entities.append({
														
 
															+                    "mongo_id": str(inserted_id),
														
 
															+                    "type": type,
														
 
															+                    "path": path,
														
 
															+                    "no": idx,
														
 
															+                    "embedding": vec
														
 
															+                })
														
 
															+        else:
														
 
															+            # 若 embedding 不是列表或长度为 0，则 no 记为 0
														
 
															+            entities.append({
														
 
															+                "mongo_id": str(inserted_id),
														
 
															+                "type": type,
														
 
															+                "path": path,
														
 
															+                "no": 0,
														
 
															+                "embedding": embedding
														
 
															+            })
														
 
															+
														
 
															+    # 将插入操作移到循环外部，避免重复插入和数据累积
														
 
															+    if entities:
														
 
															+        collection.insert(entities)
														
 
															+        collection.flush()
														
 
															+        print(f"已插入 {len(entities)} 条 media 字段向量到 Milvus")
														
 
															+    else:
														
 
															+        print("未找到有效的 media 字段向量，未插入数据")
														
 
															+
														
 
															+    #############存储一份what 多模态embedding 到Milvus
														
 
															+    entities = []
														
 
															+    for key, value in result["what"].items():
														
 
															+        embedding = get_media_embedding(value, model=DEFAULT_MODEL)
														
 
															+        # type = key[:key.index("-")]
														
 
															+        # path = key[key.index("-"):]
														
 
															+        path = key
														
 
															+        if isinstance(embedding, list) and len(embedding) > 0:
														
 
															+            for idx, vec in enumerate(embedding):
														
 
															+                entities.append({
														
 
															+                    "mongo_id": str(inserted_id),
														
 
															+                    "type": "text",
														
 
															+                    "path": path,
														
 
															+                    "no": idx,
														
 
															+                    "embedding": vec
														
 
															+                })
														
 
															+        else:
														
 
															+            # 若 embedding 不是列表或长度为 0，则 no 记为 0
														
 
															+            entities.append({
														
 
															+                "mongo_id": str(inserted_id),
														
 
															+                "type": "text",
														
 
															+                "path": path,
														
 
															+                "no": 0,
														
 
															+                "embedding": embedding
														
 
															+            })
														
 
															+
														
 
															+    # 将插入操作移到循环外部，避免重复插入和数据累积
														
 
															+    if entities:
														
 
															+        collection.insert(entities)
														
 
															+        collection.flush()
														
 
															+        print(f"已插入 {len(entities)} 条 what 多模态向量到 Milvus")
														
 
															+    else:
														
 
															+        print("未找到有效的 what 多模态向量，未插入数据")
														
 
															+    #############存储一份what 多模态embedding 到Milvus
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
--- a/deconstruct_SQI/milvus_deconstruct_query.py
+++ b/deconstruct_SQI/milvus_deconstruct_query.py
@@ -0,0 +1,114 @@
 
															+from ast import Import
														
 
															+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
														
 
															+import requests
														
 
															+import json
														
 
															+from typing import Dict, Any, List
														
 
															+from pymongo import MongoClient
														
 
															+from bson import ObjectId
														
 
															+from pydub import AudioSegment
														
 
															+import io
														
 
															+from scipy.io import wavfile
														
 
															+
														
 
															+################################连接milvus数据库 A
														
 
															+# 配置信息
														
 
															+MILVUS_CONFIG = {
														
 
															+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
														
 
															+    "user": "root",
														
 
															+    "password": "Piaoquan@2025",
														
 
															+    "port": "19530",
														
 
															+}
														
 
															+print("正在连接 Milvus 数据库...")
														
 
															+connections.connect("default", **MILVUS_CONFIG)
														
 
															+print("连接成功！")
														
 
															+################################连接milvus数据库 B
														
 
															+
														
 
															+##################### mongoDB
														
 
															+MONGO_URI = "mongodb://localhost:27017/"
														
 
															+DB_NAME = "mydeconstruct"
														
 
															+COLL_NAME = "deconstruct"
														
 
															+
														
 
															+client = MongoClient(MONGO_URI)
														
 
															+db = client[DB_NAME]
														
 
															+coll = db[COLL_NAME]
														
 
															+##################### mongoDB
														
 
															+
														
 
															+##################### 路径解析返回
														
 
															+def resolve_mongo_path(mongo_id: str, path: str):
														
 
															+    """
														
 
															+    根据 mongo_id 与形如 '文本元素[1].子节点元素[0].what' 的路径字符串，
														
 
															+    从 MongoDB 中定位并返回对应的对象。
														
 
															+    """
														
 
															+    doc = coll.find_one({"_id": ObjectId(mongo_id)})
														
 
															+    if not doc:
														
 
															+        return None
														
 
															+
														
 
															+    # 将路径按 '.' 分割，逐级访问
														
 
															+    parts = path.split('.')
														
 
															+    current = doc
														
 
															+    for part in parts:
														
 
															+        # 处理数组索引，如 子节点元素[0]
														
 
															+        if '[' in part and part.endswith(']'):
														
 
															+            key, idx_str = part.split('[', 1)
														
 
															+            idx = int(idx_str[:-1])  # 去掉 ']'
														
 
															+            current = current[key][idx]
														
 
															+        else:
														
 
															+            current = current[part]
														
 
															+    return current
														
 
															+##################### 路径解析返回
														
 
															+
														
 
															+search_mode ="what_search"
														
 
															+
														
 
															+if search_mode == "what_search":
														
 
															+    ##################query what
														
 
															+    ##################
														
 
															+    milvus_client = Collection(name="deconstruct_what")
														
 
															+    milvus_client.load()
														
 
															+    collection_name = "deconstruct_what"
														
 
															+    if not utility.has_collection(collection_name):
														
 
															+        print(f"no collection named {collection_name}")
														
 
															+    else:
														
 
															+        # 查询并打印 collection 中的所有记录
														
 
															+        print(f"正在查询 collection '{collection_name}' 中的所有记录...")
														
 
															+        try:
														
 
															+            # 使用 query 方法获取所有记录，不设置过滤条件
														
 
															+            all_records = milvus_client.query(
														
 
															+                expr="mongo_id >\"10000000\"",  # 空表达式表示查询所有
														
 
															+                output_fields=["mongo_id","path"],  # 输出所有字段
														
 
															+                limit=10000  # 设置一个较大的上限，确保能获取全部
														
 
															+            )
														
 
															+            print(f"共查询到 {len(all_records)} 条记录：")
														
 
															+            for record in all_records:
														
 
															+                print(record)
														
 
															+                rec = resolve_mongo_path(record["mongo_id"], record["path"])
														
 
															+                print("定位items：",rec)
														
 
															+                # docres = coll.find_one({"_id": ObjectId(record["mongo_id"])})
														
 
															+                # print(docres)
														
 
															+        except Exception as e:
														
 
															+            print(f"查询失败：{e}")
														
 
															+    ##############all_records返回存储的每个record， rec返回解析后的对象
														
 
															+elif search_mode == "media_search":
														
 
															+    ##################query media
														
 
															+    ##################
														
 
															+    milvus_client = Collection(name="deconstruct_media")
														
 
															+    milvus_client.load()
														
 
															+    collection_name = "deconstruct_media"
														
 
															+    if not utility.has_collection(collection_name):
														
 
															+        print(f"no collection named {collection_name}")
														
 
															+    else:
														
 
															+        # 查询并打印 collection 中的所有记录
														
 
															+        print(f"正在查询 collection '{collection_name}' 中的所有记录...")
														
 
															+        try:
														
 
															+            # 使用 query 方法获取所有记录，不设置过滤条件
														
 
															+            all_records = milvus_client.query( 
														
 
															+                expr="type==\"text\"",  # 空表达式表示查询所有
														
 
															+                output_fields=["mongo_id","path","type"],  # 输出所有字段
														
 
															+                limit=100  # 设置一个较大的上限，确保能获取全部
														
 
															+            )
														
 
															+            print(f"共查询到 {len(all_records)} 条记录：")
														
 
															+            for record in all_records:
														
 
															+                print(record)
														
 
															+                docres = coll.find_one({"_id": ObjectId(record["mongo_id"])})
														
 
															+                print(docres)
														
 
															+        except Exception as e:
														
 
															+            print(f"查询失败：{e}")
														
 
															+
														
--- a/deconstruct_SQI/milvus_deconstruct_search.py
+++ b/deconstruct_SQI/milvus_deconstruct_search.py
@@ -0,0 +1,243 @@
 
															+from ast import Import
														
 
															+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
														
 
															+import requests
														
 
															+import json
														
 
															+from typing import Dict, Any, List
														
 
															+from pymongo import MongoClient
														
 
															+from bson import ObjectId
														
 
															+from pydub import AudioSegment
														
 
															+import io, os
														
 
															+from scipy.io import wavfile
														
 
															+import numpy
														
 
															+
														
 
															+################################连接milvus数据库 A
														
 
															+# 配置信息
														
 
															+MILVUS_CONFIG = {
														
 
															+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
														
 
															+    "user": "root",
														
 
															+    "password": "Piaoquan@2025",
														
 
															+    "port": "19530",
														
 
															+}
														
 
															+print("正在连接 Milvus 数据库...")
														
 
															+connections.connect("default", **MILVUS_CONFIG)
														
 
															+print("连接成功！")
														
 
															+################################连接milvus数据库 B
														
 
															+
														
 
															+##################################引入多模态向量模型#################
														
 
															+import torch
														
 
															+from PIL import Image
														
 
															+from transformers.utils.import_utils import is_flash_attn_2_available
														
 
															+from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
														
 
															+
														
 
															+model = ColQwen2_5Omni.from_pretrained(
														
 
															+    "vidore/colqwen-omni-v0.1",
														
 
															+    torch_dtype=torch.bfloat16,
														
 
															+    device_map="cuda",  # or "mps" if on Apple Silicon
														
 
															+    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
														
 
															+).eval()
														
 
															+processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
														
 
															+##################################引入多模态向量模型#################
														
 
															+
														
 
															+##################### mongoDB
														
 
															+MONGO_URI = "mongodb://localhost:27017/"
														
 
															+DB_NAME = "mydeconstruct"
														
 
															+COLL_NAME = "deconstruct"
														
 
															+
														
 
															+client = MongoClient(MONGO_URI)
														
 
															+db = client[DB_NAME]
														
 
															+coll = db[COLL_NAME]
														
 
															+##################### mongoDB
														
 
															+
														
 
															+#####################text embedding serviceS
														
 
															+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
														
 
															+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
														
 
															+
														
 
															+def get_basic_embedding(text: str, model=DEFAULT_MODEL):
														
 
															+    """通过HTTP调用在线embedding服务"""
														
 
															+    headers = {
														
 
															+        "Content-Type": "application/json"
														
 
															+    }
														
 
															+    data = {
														
 
															+        "model": model,
														
 
															+        "input": text
														
 
															+    } 
														
 
															+    response = requests.post(
														
 
															+        VLLM_SERVER_URL,
														
 
															+        headers=headers,
														
 
															+        json=data,
														
 
															+        timeout=5  # 添加超时设置
														
 
															+    )
														
 
															+    response.raise_for_status()  # 如果状态码不是200，抛出异常
														
 
															+    result = response.json()
														
 
															+    return result["data"][0]["embedding"]
														
 
															+#####################text embedding serviceS
														
 
															+
														
 
															+#####################multi vector search
														
 
															+import numpy as np
														
 
															+from collections import defaultdict
														
 
															+from typing import List, Dict, Tuple
														
 
															+
														
 
															+###############multi vector search
														
 
															+def search_topk_multi(
														
 
															+    collection: Collection,
														
 
															+    query_vecs: List[List[float]],  # 查询向量列表 [vec1, vec2, ...]
														
 
															+    topk: int = 2
														
 
															+) -> List[Tuple[str, float]]:
														
 
															+    """
														
 
															+    对查询向量列表检索，计算每个对象的平均最大相似度，返回 TopK 对象
														
 
															+    
														
 
															+    参数：
														
 
															+        collection: Milvus 集合实例
														
 
															+        query_vecs: 查询向量列表（每个向量维度需与集合一致）
														
 
															+        topk: 返回的 top 数量
														
 
															+    
														
 
															+    返回：
														
 
															+        排序后的列表，元素为 (object_id, 平均最大相似度)
														
 
															+    """
														
 
															+    # 步骤1：逐个检索查询向量，收集每个对象的最大相似度
														
 
															+    all_query_results = []  # 存储每个查询的 {object_id: 最大相似度}
														
 
															+    for q_idx, q_vec in enumerate[List[float]](query_vecs):
														
 
															+        # 检索当前查询向量
														
 
															+        search_params = {
														
 
															+            "metric_type": "IP",
														
 
															+            "params": {"nprobe": 10}
														
 
															+        }
														
 
															+        results = collection.search(
														
 
															+            data=[q_vec],
														
 
															+            anns_field="embedding",
														
 
															+            param=search_params,
														
 
															+            limit=16384,  # Milvus最大允许的topk值
														
 
															+            output_fields=["mongo_id", "type", "path"],
														
 
															+            expr='type == "image"'  # 只检索type为text的记录
														
 
															+        )
														
 
															+        # 按 object_id 分组取最大相似度
														
 
															+        query_object_sim = defaultdict(float)
														
 
															+        for hit in results[0]:
														
 
															+            obj_id = hit.entity.get("mongo_id")
														
 
															+            sim = hit.score
														
 
															+            if sim > query_object_sim[obj_id]:
														
 
															+                query_object_sim[obj_id] = sim
														
 
															+        
														
 
															+        all_query_results.append(query_object_sim)
														
 
															+        print(f"查询向量 {q_idx+1}/{len(query_vecs)} 处理完成，覆盖 {len(query_object_sim)} 个对象")
														
 
															+    
														
 
															+    # 步骤2：计算每个对象的平均最大相似度
														
 
															+    all_object_ids = set()
														
 
															+    for res in all_query_results:
														
 
															+        all_object_ids.update(res.keys())  # 收集所有出现过的对象
														
 
															+    
														
 
															+    object_avg_sim = {}
														
 
															+    for obj_id in all_object_ids:
														
 
															+        sims = [res.get(obj_id, 0.0) for res in all_query_results]  # 未匹配的查询按0处理
														
 
															+        avg_sim = sum(sims) / len(query_vecs)  # 计算平均值
														
 
															+        object_avg_sim[obj_id] = avg_sim
														
 
															+    
														
 
															+    # 步骤3：按平均相似度排序并取 TopK
														
 
															+    sorted_objects = sorted(
														
 
															+        object_avg_sim.items(),
														
 
															+        key=lambda x: x[1],
														
 
															+        reverse=True
														
 
															+    )[:topk]
														
 
															+    return sorted_objects
														
 
															+##################文本数据库 search
														
 
															+
														
 
															+###############single vector search
														
 
															+def search_topk_single(
														
 
															+    collection: Collection,
														
 
															+    query_vec: List[float],  # 查询向量
														
 
															+    topk: int = 2
														
 
															+) -> List[dict]:
														
 
															+    """
														
 
															+    对单个查询向量检索，计算每个对象的最大相似度，返回 TopK 对象
														
 
															+    
														
 
															+    参数：
														
 
															+        collection: Milvus 集合实例
														
 
															+        query_vec: 查询向量（维度需与集合一致）
														
 
															+        topk: 返回的 top 数量
														
 
															+    
														
 
															+    返回：
														
 
															+        排序后的列表，元素为 (object_id, 最大相似度)
														
 
															+    """
														
 
															+    # 步骤1：检索当前查询向量
														
 
															+    search_params = {
														
 
															+        "metric_type": "IP",
														
 
															+        "params": {"nprobe": 10}
														
 
															+    }
														
 
															+    results = collection.search(
														
 
															+        data=[query_vec],
														
 
															+        anns_field="embedding",
														
 
															+        param=search_params,
														
 
															+        limit=topk,  # Milvus最大允许的topk值
														
 
															+        output_fields=["mongo_id", "path"],
														
 
															+    )
														
 
															+    return results[0]
														
 
															+###############single vector search
														
 
															+
														
 
															+
														
 
															+search_mode = "what_search"
														
 
															+
														
 
															+if search_mode =="what_search":
														
 
															+    ##################添加what_search
														
 
															+    ########模拟计算出的embedding
														
 
															+
														
 
															+    query = '#假如食物会说话'
														
 
															+
														
 
															+    # queries_embeddings =  get_basic_embedding(text = query )
														
 
															+
														
 
															+    #########暂时代替
														
 
															+    import numpy as np
														
 
															+    q_vec = list(np.random.randn(2560))
														
 
															+    #########暂时代替
														
 
															+
														
 
															+    milvus_client = Collection(name="deconstruct_what")
														
 
															+    milvus_client.load()
														
 
															+    collection_name = "deconstruct_what"
														
 
															+
														
 
															+    search_params = {
														
 
															+        "metric_type": "IP",
														
 
															+        "params": {"nprobe": 10}
														
 
															+    }
														
 
															+
														
 
															+    results = search_topk_single(milvus_client,q_vec,topk= 3)
														
 
															+    print("results is ", results)
														
 
															+
														
 
															+    for i,record in enumerate(results):
														
 
															+        #########暂时代替############
														
 
															+        if record['mongo_id'] =='10000000':
														
 
															+            mongo_id = '68f894176a7850acc4851b27'
														
 
															+        else:
														
 
															+            mongo_id = record['mongo_id']
														
 
															+        #########暂时代替############
														
 
															+        docres = coll.find_one({"_id": ObjectId(mongo_id)})
														
 
															+        print(f"第{i+1}个结果*********************：{docres}\n")
														
 
															+
														
 
															+elif search_mode =="media_search":
														
 
															+    ##################多模态search
														
 
															+    ##################
														
 
															+    queries = os.path.join("../src", "dragon_mother.jpeg")
														
 
															+    query_image = Image.open(queries)
														
 
															+    # Process the inputs
														
 
															+    batch_queries = processor.process_images([query_image]).to(model.device)
														
 
															+    # Forward pass
														
 
															+    with torch.no_grad():
														
 
															+        query_embeddings = model(**batch_queries)
														
 
															+
														
 
															+    ##################添加media_search
														
 
															+    milvus_client = Collection(name="deconstruct_media")
														
 
															+    milvus_client.load()
														
 
															+    collection_name = "deconstruct_media"
														
 
															+
														
 
															+    query_embeddings = query_embeddings.cpu().to(dtype=torch.float32).numpy().tolist()
														
 
															+    query_embeddings=query_embeddings[0]
														
 
															+    scores = search_topk_multi(milvus_client,query_embeddings,topk= 3)
														
 
															+
														
 
															+    print("search_topk_multi结果：",scores)
														
 
															+    ####输出结果
														
 
															+    for i,record in enumerate(scores):
														
 
															+        docres = coll.find_one({"_id": ObjectId(record[0])})
														
 
															+        print(f"第{i+1}个结果*********************：{docres}\n")
														
 
															+    
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
--- a/deconstruct_SQI/milvus_how_insert.py
+++ b/deconstruct_SQI/milvus_how_insert.py
@@ -0,0 +1,194 @@
 
															+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
														
 
															+import requests
														
 
															+import json
														
 
															+from typing import Dict, Any, List
														
 
															+from pymongo import MongoClient
														
 
															+
														
 
															+from pydub import AudioSegment
														
 
															+import io
														
 
															+from scipy.io import wavfile
														
 
															+import numpy as np
														
 
															+################################连接milvus数据库 A
														
 
															+# 配置信息
														
 
															+MILVUS_CONFIG = {
														
 
															+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
														
 
															+    "user": "root",
														
 
															+    "password": "Piaoquan@2025",
														
 
															+    "port": "19530",
														
 
															+}
														
 
															+print("正在连接 Milvus 数据库...")
														
 
															+connections.connect("default", **MILVUS_CONFIG)
														
 
															+print("连接成功！")
														
 
															+################################连接milvus数据库 B
														
 
															+
														
 
															+################################连接Embedding service A
														
 
															+# 注意：根据之前的讨论，需要通过SSH隧道将远程服务转发到本地
														
 
															+# 在本地机器上执行: ssh -R 8000:192.168.100.31:8000 username@server_ip
														
 
															+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
														
 
															+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
														
 
															+
														
 
															+def get_basic_embedding(text: str, model=DEFAULT_MODEL):
														
 
															+    """通过HTTP调用在线embedding服务"""
														
 
															+    headers = {
														
 
															+        "Content-Type": "application/json"
														
 
															+    }
														
 
															+    data = {
														
 
															+        "model": model,
														
 
															+        "input": text
														
 
															+    }
														
 
															+    
														
 
															+    response = requests.post(
														
 
															+        VLLM_SERVER_URL,
														
 
															+        headers=headers,
														
 
															+        json=data,
														
 
															+        timeout=5  # 添加超时设置
														
 
															+    )
														
 
															+    response.raise_for_status()  # 如果状态码不是200，抛出异常
														
 
															+    result = response.json()
														
 
															+    return result["data"][0]["embedding"]
														
 
															+
														
 
															+def parse_how_res(json_data) -> Dict[str, Dict[str, str]]:
														
 
															+    """
														
 
															+    解析 how_res.json 文件，提取两类信息：
														
 
															+    1. 所有 "how","why" 字段的 path 与 value 映射
														
 
															+    
														
 
															+    返回:
														
 
															+        {
														
 
															+            "how": {path: value, ...},
														
 
															+            "why": {path: value, ...}
														
 
															+        }
														
 
															+    """
														
 
															+    data = json_data
														
 
															+    how_dict: Dict[str, Any] = {}
														
 
															+    why_dict: Dict[str, Any] = {}
														
 
															+
														
 
															+    def traverse(obj: Any, current_path: str = ""):
														
 
															+        """递归遍历 JSON 结构，记录目标字段"""
														
 
															+        if isinstance(obj, dict):
														
 
															+            for k, v in obj.items():
														
 
															+                # 构建新路径，避免在开头添加点号
														
 
															+                new_path = f"{current_path}.{k}" if current_path else k
														
 
															+                
														
 
															+                if k == "how":
														
 
															+                    how_dict[new_path] = v
														
 
															+                elif k == "why":
														
 
															+                    why_dict[new_path] = v
														
 
															+                
														
 
															+                # 继续递归遍历
														
 
															+                traverse(v, new_path)
														
 
															+        
														
 
															+        elif isinstance(obj, list):
														
 
															+            for idx, item in enumerate(obj):
														
 
															+                # 对于数组元素，使用方括号索引
														
 
															+                new_path = f"{current_path}[{idx}]"
														
 
															+                traverse(item, new_path)
														
 
															+
														
 
															+    traverse(data)
														
 
															+    return {"how": how_dict, "why": why_dict}
														
 
															+
														
 
															+# 使用示例
														
 
															+if __name__ == "__main__":
														
 
															+    
														
 
															+    # 连接 MongoDB 数据库
														
 
															+    ##################### 存储到mongoDB
														
 
															+
														
 
															+    MONGO_URI = "mongodb://localhost:27017/"
														
 
															+    DB_NAME = "mydeconstruct"
														
 
															+    COLL_NAME = "deconstruct_how"
														
 
															+
														
 
															+    client = MongoClient(MONGO_URI)
														
 
															+    db = client[DB_NAME]
														
 
															+    coll = db[COLL_NAME]
														
 
															+
														
 
															+    # 读取并插入 JSON 文件
														
 
															+    json_path = "/home/ecs-user/project/colpali/src/how_res.json"
														
 
															+
														
 
															+    with open(json_path, "r", encoding="utf-8") as f:
														
 
															+        doc = json.load(f)
														
 
															+
														
 
															+    result = parse_how_res(doc)
														
 
															+    # print("how 字段映射：", result["how"])
														
 
															+    # print("why 字段映射：", result["why"])
														
 
															+
														
 
															+    for key, value in result["how"].items():
														
 
															+        print(f"how 字段 {key} 的值为: {value}")
														
 
															+
														
 
															+    for key, value in result["why"].items():
														
 
															+        print(f"why 字段 {key} 的值为: {value}")
														
 
															+
														
 
															+    insert_result = coll.insert_one(doc)
														
 
															+    inserted_id = insert_result.inserted_id
														
 
															+
														
 
															+    ##################### 将 result["how"] 中的每个 value 转换为向量并插入 Milvus
														
 
															+    ########## 文本向量库存一份how
														
 
															+    # 创建 Milvus 集合（如不存在）
														
 
															+    collection_name = "deconstruct_how"
														
 
															+    if not utility.has_collection(collection_name): 
														
 
															+        fields = [
														
 
															+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
														
 
															+            FieldSchema(name="mongo_id", dtype=DataType.VARCHAR, max_length=64),
														
 
															+            FieldSchema(name="type", dtype=DataType.VARCHAR, max_length=64),
														
 
															+            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512),
														
 
															+            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=2560)
														
 
															+        ]
														
 
															+        schema = CollectionSchema(fields, description="Deconstruct how embeddings")
														
 
															+        collection = Collection(name=collection_name, schema=schema)
														
 
															+        # 创建 IVF_FLAT 索引
														
 
															+        index_params = {
														
 
															+            "metric_type": "IP",
														
 
															+            "index_type": "IVF_FLAT",
														
 
															+            "params": {"nlist": 128}
														
 
															+        }
														
 
															+        collection.create_index("embedding", index_params)
														
 
															+    else:
														
 
															+        collection = Collection(name=collection_name)
														
 
															+
														
 
															+    entities = []
														
 
															+    for key, value in result["how"].items():
														
 
															+
														
 
															+        ### 访问可达则替换
														
 
															+        # embedding = get_basic_embedding(value, model=DEFAULT_MODEL)
														
 
															+        ###
														
 
															+        embedding = np.random.rand(2560).tolist()
														
 
															+
														
 
															+        path = key
														
 
															+        entities.append({
														
 
															+            "mongo_id": str(inserted_id),
														
 
															+            "type": "how", 
														
 
															+            "path": path,
														
 
															+            "embedding": embedding
														
 
															+        })
														
 
															+
														
 
															+    # 遍历 result["why"]，生成 embeddings 并插入 Milvus
														
 
															+    if entities:
														
 
															+        collection.insert(entities)
														
 
															+        collection.flush()
														
 
															+        print(f"已插入 {len(entities)} 条 how 字段向量到 Milvus")
														
 
															+    else:
														
 
															+        print("未找到 how 字段，未插入向量")
														
 
															+
														
 
															+    entities = []
														
 
															+    for key, value in result["why"].items():
														
 
															+
														
 
															+        # embedding = get_basic_embedding(value, model=DEFAULT_MODEL)
														
 
															+        
														
 
															+        embedding = np.random.rand(2560).tolist()
														
 
															+
														
 
															+        path = key
														
 
															+        entities.append({
														
 
															+            "mongo_id": str(inserted_id),
														
 
															+            "type": "why", 
														
 
															+            "path": path,
														
 
															+            "embedding": embedding
														
 
															+        })
														
 
															+
														
 
															+    if entities:
														
 
															+        collection.insert(entities)
														
 
															+        collection.flush()
														
 
															+        print(f"已插入 {len(entities)} 条 why 字段向量到 Milvus")
														
 
															+    else:
														
 
															+        print("未找到 why 字段，未插入向量")
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
--- a/deconstruct_SQI/milvus_how_query.py
+++ b/deconstruct_SQI/milvus_how_query.py
@@ -0,0 +1,116 @@
 
															+from ast import Import
														
 
															+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
														
 
															+import requests
														
 
															+import json
														
 
															+from typing import Dict, Any, List
														
 
															+from pymongo import MongoClient
														
 
															+from bson import ObjectId
														
 
															+from pydub import AudioSegment
														
 
															+import io
														
 
															+from scipy.io import wavfile
														
 
															+
														
 
															+################################连接milvus数据库 A
														
 
															+# 配置信息
														
 
															+MILVUS_CONFIG = {
														
 
															+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
														
 
															+    "user": "root",
														
 
															+    "password": "Piaoquan@2025",
														
 
															+    "port": "19530",
														
 
															+}
														
 
															+print("正在连接 Milvus 数据库...")
														
 
															+connections.connect("default", **MILVUS_CONFIG)
														
 
															+print("连接成功！")
														
 
															+################################连接milvus数据库 B
														
 
															+
														
 
															+##################### mongoDB
														
 
															+MONGO_URI = "mongodb://localhost:27017/"
														
 
															+DB_NAME = "mydeconstruct"
														
 
															+COLL_NAME = "deconstruct_how"
														
 
															+
														
 
															+client = MongoClient(MONGO_URI)
														
 
															+db = client[DB_NAME]
														
 
															+coll = db[COLL_NAME]
														
 
															+##################### mongoDB
														
 
															+
														
 
															+##################### 路径解析返回
														
 
															+def resolve_mongo_path(mongo_id: str, path: str):
														
 
															+    """
														
 
															+    根据 mongo_id 与形如 '文本元素[1].子节点元素[0].what' 的路径字符串，
														
 
															+    从 MongoDB 中定位并返回对应的对象。
														
 
															+    """
														
 
															+    doc = coll.find_one({"_id": ObjectId(mongo_id)})
														
 
															+    if not doc:
														
 
															+        return None
														
 
															+
														
 
															+    # 将路径按 '.' 分割，逐级访问
														
 
															+    parts = path.split('.')
														
 
															+    current = doc
														
 
															+    for part in parts:
														
 
															+        # 处理数组索引，如 子节点元素[0]
														
 
															+        if '[' in part and part.endswith(']'):
														
 
															+            key, idx_str = part.split('[', 1)
														
 
															+            idx = int(idx_str[:-1])  # 去掉 ']'
														
 
															+            current = current[key][idx]
														
 
															+        else:
														
 
															+            current = current[part]
														
 
															+    return current
														
 
															+##################### 路径解析返回
														
 
															+
														
 
															+search_mode ="why_search" # "why_search"
														
 
															+
														
 
															+if search_mode == "how_search":
														
 
															+    ##################query what
														
 
															+    ##################
														
 
															+    milvus_client = Collection(name="deconstruct_how")
														
 
															+    milvus_client.load()
														
 
															+    collection_name = "deconstruct_how"
														
 
															+    if not utility.has_collection(collection_name):
														
 
															+        print(f"no collection named {collection_name}")
														
 
															+    else:
														
 
															+        # 查询并打印 collection 中的所有记录
														
 
															+        print(f"正在查询 collection '{collection_name}' 中的所有记录...")
														
 
															+        try:
														
 
															+            # 使用 query 方法获取所有记录，不设置过滤条件
														
 
															+            all_records = milvus_client.query(
														
 
															+                expr="mongo_id >\"10000000\" and type == \"how\"",  # 空表达式表示查询所有
														
 
															+                output_fields=["mongo_id","type","path"],  # 输出所有字段
														
 
															+                limit=10  # 设置一个较大的上限，确保能获取全部
														
 
															+            )
														
 
															+            print(f"共查询到 {len(all_records)} 条记录：")
														
 
															+            for record in all_records:
														
 
															+                print(record)
														
 
															+                rec = resolve_mongo_path(record["mongo_id"], record["path"])
														
 
															+                print("定位items：",rec)
														
 
															+                # docres = coll.find_one({"_id": ObjectId(record["mongo_id"])})
														
 
															+                # print(docres)
														
 
															+        except Exception as e:
														
 
															+            print(f"查询失败：{e}")
														
 
															+    ##############all_records返回存储的每个record， rec返回解析后的对象
														
 
															+elif search_mode == "why_search":
														
 
															+    ##################query why
														
 
															+    ##################
														
 
															+    milvus_client = Collection(name="deconstruct_how")
														
 
															+    milvus_client.load()
														
 
															+    collection_name = "deconstruct_how"
														
 
															+    if not utility.has_collection(collection_name):
														
 
															+        print(f"no collection named {collection_name}")
														
 
															+    else:
														
 
															+        # 查询并打印 collection 中的所有记录
														
 
															+        print(f"正在查询 collection '{collection_name}' 中的所有记录...")
														
 
															+        try:
														
 
															+            # 使用 query 方法获取所有记录，不设置过滤条件
														
 
															+            all_records = milvus_client.query(
														
 
															+                expr="mongo_id >\"10000000\" and type == \"why\"",  # 空表达式表示查询所有
														
 
															+                output_fields=["mongo_id","type","path"],  # 输出所有字段
														
 
															+                limit=10  # 设置一个较大的上限，确保能获取全部
														
 
															+            )
														
 
															+            print(f"共查询到 {len(all_records)} 条记录：")
														
 
															+            for record in all_records:
														
 
															+                print(record)
														
 
															+                rec = resolve_mongo_path(record["mongo_id"], record["path"])
														
 
															+                print("定位items：",rec)
														
 
															+                # docres = coll.find_one({"_id": ObjectId(record["mongo_id"])})
														
 
															+                # print(docres)
														
 
															+        except Exception as e:
														
 
															+            print(f"查询失败：{e}")
														
 
															+    ##############all_records返回存储的每个record， rec返回解析后的对象
														
--- a/deconstruct_SQI/milvus_how_search.py
+++ b/deconstruct_SQI/milvus_how_search.py
@@ -0,0 +1,241 @@
 
															+from ast import Import
														
 
															+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
														
 
															+import requests
														
 
															+import json
														
 
															+from typing import Dict, Any, List
														
 
															+from pymongo import MongoClient
														
 
															+from bson import ObjectId
														
 
															+from pydub import AudioSegment
														
 
															+import io, os
														
 
															+from scipy.io import wavfile
														
 
															+import numpy
														
 
															+
														
 
															+################################连接milvus数据库 A
														
 
															+# 配置信息
														
 
															+MILVUS_CONFIG = {
														
 
															+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
														
 
															+    "user": "root",
														
 
															+    "password": "Piaoquan@2025",
														
 
															+    "port": "19530",
														
 
															+}
														
 
															+print("正在连接 Milvus 数据库...")
														
 
															+connections.connect("default", **MILVUS_CONFIG)
														
 
															+print("连接成功！")
														
 
															+################################连接milvus数据库 B
														
 
															+
														
 
															+##################################引入多模态向量模型#################
														
 
															+import torch
														
 
															+from PIL import Image
														
 
															+from transformers.utils.import_utils import is_flash_attn_2_available
														
 
															+from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
														
 
															+
														
 
# Load the ColQwen2.5-Omni model from the "vidore/colqwen-omni-v0.1"
# checkpoint in bfloat16 with device_map="cuda", then switch it to eval mode.
# FlashAttention-2 is requested only when transformers reports it available;
# otherwise attn_implementation is passed as None.
# NOTE(review): neither `model` nor `processor` is referenced again in this
# file -- confirm the load is still needed here.
model = ColQwen2_5Omni.from_pretrained(
    "vidore/colqwen-omni-v0.1",
    torch_dtype=torch.bfloat16,
    device_map="cuda",  # or "mps" if on Apple Silicon
    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
).eval()
# The processor is loaded from a different repository id ("manu/...") than the model.
processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
														
 
															+##################################引入多模态向量模型#################
														
 
															+
														
 
															+##################### mongoDB
														
 
# MongoDB connection settings and module-level handles. `coll` is the
# collection the search code below looks documents up in by `_id`.
MONGO_URI = "mongodb://localhost:27017/"
DB_NAME = "mydeconstruct"
COLL_NAME = "deconstruct_how"

client = MongoClient(MONGO_URI)
db = client[DB_NAME]
coll = db[COLL_NAME]
														
 
															+##################### mongoDB
														
 
															+
														
 
															+#####################text embedding serviceS
														
 
															+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
														
 
															+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
														
 
															+
														
 
															+def get_basic_embedding(text: str, model=DEFAULT_MODEL):
														
 
															+    """通过HTTP调用在线embedding服务"""
														
 
															+    headers = {
														
 
															+        "Content-Type": "application/json"
														
 
															+    }
														
 
															+    data = {
														
 
															+        "model": model,
														
 
															+        "input": text
														
 
															+    } 
														
 
															+    response = requests.post(
														
 
															+        VLLM_SERVER_URL,
														
 
															+        headers=headers,
														
 
															+        json=data,
														
 
															+        timeout=5  # 添加超时设置
														
 
															+    )
														
 
															+    response.raise_for_status()  # 如果状态码不是200，抛出异常
														
 
															+    result = response.json()
														
 
															+    return result["data"][0]["embedding"]
														
 
															+#####################text embedding serviceS
														
 
															+#####################multi vector search
														
 
															+import numpy as np
														
 
															+from collections import defaultdict
														
 
															+from typing import List, Dict, Tuple
														
 
															+
														
 
															+###############multi vector search
														
 
															+def search_topk_multi(
														
 
															+    collection: Collection,
														
 
															+    query_vecs: List[List[float]],  # 查询向量列表 [vec1, vec2, ...]
														
 
															+    topk: int = 2
														
 
															+) -> List[Tuple[str, float]]:
														
 
															+    """
														
 
															+    对查询向量列表检索，计算每个对象的平均最大相似度，返回 TopK 对象
														
 
															+    
														
 
															+    参数：
														
 
															+        collection: Milvus 集合实例
														
 
															+        query_vecs: 查询向量列表（每个向量维度需与集合一致）
														
 
															+        topk: 返回的 top 数量
														
 
															+    
														
 
															+    返回：
														
 
															+        排序后的列表，元素为 (object_id, 平均最大相似度)
														
 
															+    """
														
 
															+    # 步骤1：逐个检索查询向量，收集每个对象的最大相似度
														
 
															+    all_query_results = []  # 存储每个查询的 {object_id: 最大相似度}
														
 
															+    for q_idx, q_vec in enumerate[List[float]](query_vecs):
														
 
															+        # 检索当前查询向量
														
 
															+        search_params = {
														
 
															+            "metric_type": "IP",
														
 
															+            "params": {"nprobe": 10}
														
 
															+        }
														
 
															+        results = collection.search(
														
 
															+            data=[q_vec],
														
 
															+            anns_field="embedding",
														
 
															+            param=search_params,
														
 
															+            limit=16384,  # Milvus最大允许的topk值
														
 
															+            output_fields=["mongo_id", "type", "path"],
														
 
															+            expr='type == "image"'  # 只检索type为text的记录
														
 
															+        )
														
 
															+        # 按 object_id 分组取最大相似度
														
 
															+        query_object_sim = defaultdict(float)
														
 
															+        for hit in results[0]:
														
 
															+            obj_id = hit.entity.get("mongo_id")
														
 
															+            sim = hit.score
														
 
															+            if sim > query_object_sim[obj_id]:
														
 
															+                query_object_sim[obj_id] = sim
														
 
															+        
														
 
															+        all_query_results.append(query_object_sim)
														
 
															+        print(f"查询向量 {q_idx+1}/{len(query_vecs)} 处理完成，覆盖 {len(query_object_sim)} 个对象")
														
 
															+    
														
 
															+    # 步骤2：计算每个对象的平均最大相似度
														
 
															+    all_object_ids = set()
														
 
															+    for res in all_query_results:
														
 
															+        all_object_ids.update(res.keys())  # 收集所有出现过的对象
														
 
															+    
														
 
															+    object_avg_sim = {}
														
 
															+    for obj_id in all_object_ids:
														
 
															+        sims = [res.get(obj_id, 0.0) for res in all_query_results]  # 未匹配的查询按0处理
														
 
															+        avg_sim = sum(sims) / len(query_vecs)  # 计算平均值
														
 
															+        object_avg_sim[obj_id] = avg_sim
														
 
															+    
														
 
															+    # 步骤3：按平均相似度排序并取 TopK
														
 
															+    sorted_objects = sorted(
														
 
															+        object_avg_sim.items(),
														
 
															+        key=lambda x: x[1],
														
 
															+        reverse=True
														
 
															+    )[:topk]
														
 
															+    return sorted_objects
														
 
															+##################文本数据库 search
														
 
															+###############single vector search
														
 
															+def search_topk_single(
														
 
															+    collection: Collection,
														
 
															+    query_vec: List[float],  # 查询向量
														
 
															+    topk: int = 2,
														
 
															+    expr='type=="why"',
														
 
															+) -> List[dict]:
														
 
															+    """
														
 
															+    对单个查询向量检索，计算每个对象的最大相似度，返回 TopK 对象
														
 
															+    
														
 
															+    参数：
														
 
															+        collection: Milvus 集合实例
														
 
															+        query_vec: 查询向量（维度需与集合一致）
														
 
															+        topk: 返回的 top 数量
														
 
															+    
														
 
															+    返回：
														
 
															+        排序后的列表，元素为 (object_id, 最大相似度)
														
 
															+    """
														
 
															+    # 步骤1：检索当前查询向量
														
 
															+    search_params = {
														
 
															+        "metric_type": "IP",
														
 
															+        "params": {"nprobe": 10}
														
 
															+    }
														
 
															+    results = collection.search(
														
 
															+        data=[query_vec],
														
 
															+        anns_field="embedding",
														
 
															+        param=search_params,
														
 
															+        limit=topk,  # Milvus最大允许的topk值
														
 
															+        output_fields=["mongo_id","type","path"],
														
 
															+        expr= expr
														
 
															+    )
														
 
															+    return results[0]
														
 
															+###############single vector search
														
 
															+
														
 
															+search_mode = "how_search" #"why_search"
														
 
															+
														
 
															+if search_mode =="how_search":
														
 
															+    ##################添加what_search
														
 
															+    ########模拟计算出的embedding
														
 
															+
														
 
															+    query = '#假如食物会说话'
														
 
															+    
														
 
															+    # queries_embeddings =  get_basic_embedding(text = query )
														
 
															+    #########暂时代替
														
 
															+    import numpy as np
														
 
															+    q_vec = list(np.random.randn(2560))
														
 
															+    #########暂时代替
														
 
															+
														
 
															+    milvus_client = Collection(name="deconstruct_how")
														
 
															+    milvus_client.load()
														
 
															+    collection_name = "deconstruct_how"
														
 
															+
														
 
															+    search_params = {
														
 
															+        "metric_type": "IP",
														
 
															+        "params": {"nprobe": 10}
														
 
															+    }
														
 
															+
														
 
															+    results = search_topk_single(milvus_client,q_vec,topk= 3,expr = 'type=="how"')
														
 
															+    print("results is ", results)
														
 
															+    for i,record in enumerate(results):
														
 
															+        #########暂时代替############
														
 
															+        if record['mongo_id'] =='10000000':
														
 
															+            mongo_id = '68f894176a7850acc4851b27'
														
 
															+        else:
														
 
															+            mongo_id = record['mongo_id']
														
 
															+        #########暂时代替############
														
 
															+        docres = coll.find_one({"_id": ObjectId(mongo_id)})
														
 
															+        # print(f"第{i+1}个结果*********************：{docres}\n")
														
 
															+
														
 
															+elif search_mode =="why_search":
														
 
															+    ##################添加what_search
														
 
															+    ########模拟计算出的embedding
														
 
															+    query = '#假如食物会说话'
														
 
															+    # queries_embeddings =  get_basic_embedding(text = query )
														
 
															+    #########暂时代替
														
 
															+    import numpy as np
														
 
															+    q_vec = list(np.random.randn(2560))
														
 
															+    #########暂时代替
														
 
															+
														
 
															+    milvus_client = Collection(name="deconstruct_how")
														
 
															+    milvus_client.load()
														
 
															+    collection_name = "deconstruct_how"
														
 
															+
														
 
															+    search_params = {
														
 
															+        "metric_type": "IP",
														
 
															+        "params": {"nprobe": 10}
														
 
															+    }
														
 
															+
														
 
															+    results = search_topk_single(milvus_client,q_vec,topk= 3,expr = 'type=="why"')
														
 
															+    print("results is ", results)
														
 
															+
														
 
															+    for i,record in enumerate(results):
														
 
															+        #########暂时代替############
														
 
															+        if record['mongo_id'] =='10000000':
														
 
															+            mongo_id = '68f894176a7850acc4851b27'
														
 
															+        else:
														
 
															+            mongo_id = record['mongo_id']
														
 
															+        #########暂时代替############
														
 
															+        docres = coll.find_one({"_id": ObjectId(mongo_id)})
														
 
															+        print(f"第{i+1}个结果*********************：{docres}\n")
														
--- a/deconstruct_SQI/milvus_pattern_insert.py
+++ b/deconstruct_SQI/milvus_pattern_insert.py
@@ -0,0 +1,184 @@
 
															+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
														
 
															+import requests
														
 
															+import json
														
 
															+from typing import Dict, Any, List
														
 
															+from pymongo import MongoClient
														
 
															+
														
 
															+from pydub import AudioSegment
														
 
															+import io
														
 
															+from scipy.io import wavfile
														
 
															+import numpy as np
														
 
															+################################连接milvus数据库 A
														
 
															+# 配置信息
														
 
															+MILVUS_CONFIG = {
														
 
															+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
														
 
															+    "user": "root",
														
 
															+    "password": "Piaoquan@2025",
														
 
															+    "port": "19530",
														
 
															+}
														
 
															+print("正在连接 Milvus 数据库...")
														
 
															+connections.connect("default", **MILVUS_CONFIG)
														
 
															+print("连接成功！")
														
 
															+################################连接milvus数据库 B
														
 
															+
														
 
															+################################连接Embedding service A
														
 
															+# 注意：根据之前的讨论，需要通过SSH隧道将远程服务转发到本地
														
 
															+# 在本地机器上执行: ssh -R 8000:192.168.100.31:8000 username@server_ip
														
 
															+VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
														
 
															+DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"
														
 
															+
														
 
															+def get_basic_embedding(text: str, model=DEFAULT_MODEL):
														
 
															+    """通过HTTP调用在线embedding服务"""
														
 
															+    headers = {
														
 
															+        "Content-Type": "application/json"
														
 
															+    }
														
 
															+    data = {
														
 
															+        "model": model,
														
 
															+        "input": text
														
 
															+    }
														
 
															+    
														
 
															+    response = requests.post(
														
 
															+        VLLM_SERVER_URL,
														
 
															+        headers=headers,
														
 
															+        json=data,
														
 
															+        timeout=5  # 添加超时设置
														
 
															+    )
														
 
															+    response.raise_for_status()  # 如果状态码不是200，抛出异常
														
 
															+    result = response.json()
														
 
															+    return result["data"][0]["embedding"]
														
 
															+
														
 
															+def parse_pattern_res(json_data) -> Dict[str, Dict[str, str]]:
														
 
															+    """
														
 
															+    解析 pattern_res.json 文件，提取两类信息：
														
 
															+    1. 所有 "模式ID","模式命名","模式说明" 字段的 path 与 value 映射
														
 
															+    
														
 
															+    返回:
														
 
															+        {
														
 
															+            "模式ID": {path: value, ...},
														
 
															+            "模式命名": {path: value, ...},
														
 
															+            "模式说明": {path: value, ...}
														
 
															+        }
														
 
															+    """
														
 
															+    data = json_data
														
 
															+    pattern_dict: Dict[str, Any] = {}
														
 
															+    def traverse(obj: Any, current_path: str = ""):
														
 
															+        """递归遍历 JSON 结构，记录目标字段"""
														
 
															+        if isinstance(obj, dict):
														
 
															+            for k, v in obj.items():
														
 
															+                # 构建新路径，避免在开头添加点号
														
 
															+                new_path = f"{current_path}.{k}" if current_path else k
														
 
															+                if k == "模式ID":
														
 
															+                    # 当遇到“模式ID”时，同时获取同层的“模式命名”和“模式描述”
														
 
															+                    temp_dict ={}
														
 
															+                    temp_dict["模式ID"] = v
														
 
															+                    temp_dict["模式命名"] = obj.get("模式命名", "")
														
 
															+                    temp_dict["模式说明"] = obj.get("模式说明", "")
														
 
															+
														
 
															+                    pattern_dict[current_path] = temp_dict
														
 
															+
														
 
															+                traverse(v, new_path)
														
 
															+        
														
 
															+        elif isinstance(obj, list):
														
 
															+            for idx, item in enumerate(obj):
														
 
															+                # 对于数组元素，使用方括号索引
														
 
															+                new_path = f"{current_path}[{idx}]"
														
 
															+                traverse(item, new_path)
														
 
															+    traverse(data)
														
 
															+    return {"pattern": pattern_dict}
														
 
															+
														
 
															+# 使用示例
														
 
															+if __name__ == "__main__":
														
 
															+    # 连接 MongoDB 数据库
														
 
															+    ##################### 存储到mongoDB
														
 
															+    MONGO_URI = "mongodb://localhost:27017/"
														
 
															+    DB_NAME = "mydeconstruct"
														
 
															+    COLL_NAME = "deconstruct_how"
														
 
															+
														
 
															+    client = MongoClient(MONGO_URI)
														
 
															+    db = client[DB_NAME]
														
 
															+    coll = db[COLL_NAME]
														
 
															+
														
 
															+    # 读取并插入 JSON 文件
														
 
															+    json_path = "/home/ecs-user/project/colpali/src/pattern_res.json"
														
 
															+
														
 
															+    with open(json_path, "r", encoding="utf-8") as f:
														
 
															+        doc = json.load(f)
														
 
															+
														
 
															+    result = parse_pattern_res(doc)
														
 
															+
														
 
															+    for key, value in result["pattern"].items():
														
 
															+        print(f"pattern 字段 {key} 的值为: {value}")
														
 
															+
														
 
															+    # exit()
														
 
															+    insert_result = coll.insert_one(doc)
														
 
															+    inserted_id = insert_result.inserted_id
														
 
															+
														
 
															+    ##################### 将 result["how"] 中的每个 value 转换为向量并插入 Milvus
														
 
															+    ########## 文本向量库存一份how
														
 
															+    # 创建 Milvus 集合（如不存在）
														
 
															+    collection_name = "deconstruct_pattern"
														
 
															+    if not utility.has_collection(collection_name): 
														
 
															+        # utility.drop_collection(collection_name)
														
 
															+        fields = [
														
 
															+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
														
 
															+            FieldSchema(name="mongo_id", dtype=DataType.VARCHAR, max_length=64),
														
 
															+            FieldSchema(name="pattern_id", dtype=DataType.VARCHAR, max_length=64),
														
 
															+            FieldSchema(name="pattern_name", dtype=DataType.VARCHAR, max_length=128),
														
 
															+            FieldSchema(name="pattern_desc", dtype=DataType.VARCHAR, max_length=2048),
														
 
															+            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=512),
														
 
															+            FieldSchema(name="name_embedding", dtype=DataType.FLOAT_VECTOR, dim=2560),
														
 
															+            FieldSchema(name="desc_embedding", dtype=DataType.FLOAT_VECTOR, dim=2560)
														
 
															+        ]
														
 
															+        schema = CollectionSchema(fields, description="Deconstruct how embeddings")
														
 
															+        collection = Collection(name=collection_name, schema=schema)
														
 
															+        # 创建 IVF_FLAT 索引
														
 
															+        index_params = {
														
 
															+            "metric_type": "IP",
														
 
															+            "index_type": "IVF_FLAT",
														
 
															+            "params": {"nlist": 128}
														
 
															+        }
														
 
															+
														
 
															+        # 为 pattern_id 字段创建字符串索引
														
 
															+        collection.create_index("pattern_id", {
														
 
															+            "index_type": "INVERTED" #"Trie"
														
 
															+        })
														
 
															+        collection.create_index("name_embedding", index_params)
														
 
															+        collection.create_index("desc_embedding", index_params)
														
 
															+    else:
														
 
															+        collection = Collection(name=collection_name)
														
 
															+
														
 
															+    entities = []
														
 
															+    for key, value in result["pattern"].items():
														
 
															+        pattern_id = value["模式ID"]
														
 
															+        pattern_name = value["模式命名"]
														
 
															+        pattern_desc = value["模式说明"]
														
 
															+
														
 
															+        ### 访问可达则替换
														
 
															+        # name_embedding = get_basic_embedding(pattern_name, model=DEFAULT_MODEL)
														
 
															+        # desc_embedding = get_basic_embedding(pattern_desc, model=DEFAULT_MODEL)
														
 
															+        ###
														
 
															+        name_embedding = np.random.rand(2560).tolist()
														
 
															+        desc_embedding = np.random.rand(2560).tolist()
														
 
															+
														
 
															+        path = key
														
 
															+        entities.append({
														
 
															+            "mongo_id": str(inserted_id),
														
 
															+            "pattern_id": pattern_id, 
														
 
															+            "pattern_name": pattern_name,
														
 
															+            "pattern_desc": pattern_desc,
														
 
															+            "path": path,
														
 
															+            "name_embedding": name_embedding,
														
 
															+            "desc_embedding": desc_embedding
														
 
															+        })
														
 
															+    # 遍历 result["pattern"]，生成 embeddings 并插入 Milvus
														
 
															+    # print("entities is ", entities)
														
 
															+    if entities:
														
 
															+        collection.insert(entities)
														
 
															+        collection.flush()
														
 
															+        print(f"已插入 {len(entities)} 条 how 字段向量到 Milvus")
														
 
															+    else:
														
 
															+        print("未找到 how 字段，未插入向量")
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
--- a/deconstruct_SQI/milvus_pattern_query.py
+++ b/deconstruct_SQI/milvus_pattern_query.py
@@ -0,0 +1,87 @@
 
															+from ast import Import
														
 
															+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
														
 
															+import requests
														
 
															+import json
														
 
															+from typing import Dict, Any, List
														
 
															+from pymongo import MongoClient
														
 
															+from bson import ObjectId
														
 
															+from pydub import AudioSegment
														
 
															+import io
														
 
															+from scipy.io import wavfile
														
 
															+
														
 
															+################################连接milvus数据库 A
														
 
															+# 配置信息
														
 
															+MILVUS_CONFIG = {
														
 
															+    "host": "c-981be0ee7225467b-internal.milvus.aliyuncs.com",
														
 
															+    "user": "root",
														
 
															+    "password": "Piaoquan@2025",
														
 
															+    "port": "19530",
														
 
															+}
														
 
															+print("正在连接 Milvus 数据库...")
														
 
															+connections.connect("default", **MILVUS_CONFIG)
														
 
															+print("连接成功！")
														
 
															+################################连接milvus数据库 B
														
 
															+
														
 
															+##################### mongoDB
														
 
# MongoDB connection settings and module-level handles. `coll` is the
# collection that resolve_mongo_path() reads documents from.
MONGO_URI = "mongodb://localhost:27017/"
DB_NAME = "mydeconstruct"
COLL_NAME = "deconstruct_how"

client = MongoClient(MONGO_URI)
db = client[DB_NAME]
coll = db[COLL_NAME]
														
 
															+##################### mongoDB
														
 
															+
														
 
															+##################### 路径解析返回
														
 
															+def resolve_mongo_path(mongo_id: str, path: str):
														
 
															+    """
														
 
															+    根据 mongo_id 与形如 '文本元素[1].子节点元素[0].what' 的路径字符串，
														
 
															+    从 MongoDB 中定位并返回对应的对象。
														
 
															+    """
														
 
															+    doc = coll.find_one({"_id": ObjectId(mongo_id)})
														
 
															+    if not doc:
														
 
															+        return None
														
 
															+
														
 
															+    # 将路径按 '.' 分割，逐级访问
														
 
															+    parts = path.split('.')
														
 
															+    current = doc
														
 
															+    for part in parts:
														
 
															+        # 处理数组索引，如 子节点元素[0]
														
 
															+        if '[' in part and part.endswith(']'):
														
 
															+            key, idx_str = part.split('[', 1)
														
 
															+            idx = int(idx_str[:-1])  # 去掉 ']'
														
 
															+            current = current[key][idx]
														
 
															+        else:
														
 
															+            current = current[part]
														
 
															+    return current
														
 
															+##################### 路径解析返回
														
 
															+
														
 
															+search_mode ="id_search"  # "name_search" ,"desc_search", "id_search"
														
 
															+
														
 
															+if search_mode == "id_search":
														
 
															+    ##################query what
														
 
															+    ##################
														
 
															+    milvus_client = Collection(name="deconstruct_pattern")
														
 
															+    milvus_client.load()
														
 
															+    collection_name = "deconstruct_pattern"
														
 
															+    if not utility.has_collection(collection_name):
														
 
															+        print(f"no collection named {collection_name}")
														
 
															+    else:
														
 
															+        # 查询并打印 collection 中的所有记录
														
 
															+        print(f"正在查询 collection '{collection_name}' 中的所有记录...")
														
 
															+        try:
														
 
															+            # 使用 query 方法获取所有记录，不设置过滤条件
														
 
															+            all_records = milvus_client.query(
														
 
															+                expr="pattern_id =='pattern_制作-图集-形容词_2' ",  # 空表达式表示查询所有
														
 
															+                output_fields=["mongo_id","pattern_id","pattern_name","pattern_desc","path"],  # 输出所有字段
														
 
															+                limit=10  # 设置一个较大的上限，确保能获取全部
														
 
															+            )
														
 
															+            print(f"共查询到 {len(all_records)} 条记录：")
														
 
															+            for record in all_records:
														
 
															+                print(record)
														
 
															+                rec = resolve_mongo_path(record["mongo_id"], record["path"])
														
 
															+                print("定位items：",rec)
														
 
															+                # docres = coll.find_one({"_id": ObjectId(record["mongo_id"])})
														
 
															+                # print(docres)
														
 
															+        except Exception as e:
														
 
															+            print(f"查询失败：{e}")
														
--- a/deconstruct_SQI/milvus_pattern_search.py
+++ b/deconstruct_SQI/milvus_pattern_search.py
@@ -0,0 +1,207 @@
 
															+from ast import Import
														
 
															+from pymilvus import Collection, connections, utility, FieldSchema, CollectionSchema, DataType
														
 
															+import requests
														
 
															+import json
														
 
															+from typing import Dict, Any, List
														
 
															+from pymongo import MongoClient
														
 
															+from bson import ObjectId
														
 
															+from pydub import AudioSegment
														
 
															+import io, os
														
 
															+from scipy.io import wavfile
														
 
															+import numpy
														
 
															+
														
 
															+################################连接milvus数据库 A
														
 
															+# 配置信息
														
 
															# Connection settings for Milvus. Every value can be overridden through an
															# environment variable; the fallbacks are the values this script shipped
															# with, so running it without any environment set behaves as before.
															# NOTE(review): the fallback password is a credential committed to source
															# control - rotate it and drop the default once MILVUS_PASSWORD is set
															# wherever this script runs.
															MILVUS_CONFIG = {
															    "host": os.environ.get(
															        "MILVUS_HOST", "c-981be0ee7225467b-internal.milvus.aliyuncs.com"
															    ),
															    "user": os.environ.get("MILVUS_USER", "root"),
															    "password": os.environ.get("MILVUS_PASSWORD", "Piaoquan@2025"),
															    "port": os.environ.get("MILVUS_PORT", "19530"),
															}
															print("正在连接 Milvus 数据库...")
															# Registers the connection under the alias "default" at import time.
															connections.connect("default", **MILVUS_CONFIG)
															print("连接成功！")
														
 
															+################################连接milvus数据库 B
														
 
															+
														
 
															+# ##################################引入多模态向量模型#################
														
 
															+# import torch
														
 
															+# from PIL import Image
														
 
															+# from transformers.utils.import_utils import is_flash_attn_2_available
														
 
															+# from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
														
 
															+
														
 
															+# model = ColQwen2_5Omni.from_pretrained(
														
 
															+#     "vidore/colqwen-omni-v0.1",
														
 
															+#     torch_dtype=torch.bfloat16,
														
 
															+#     device_map="cuda",  # or "mps" if on Apple Silicon
														
 
															+#     attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
														
 
															+# ).eval()
														
 
															+# processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
														
 
															+# ##################################引入多模态向量模型#################
														
 
															+
														
 
															+##################### mongoDB
														
 
															# MongoDB location of the documents that Milvus records point back to.
															MONGO_URI = "mongodb://localhost:27017/"
															DB_NAME = "mydeconstruct"
															COLL_NAME = "deconstruct_how"

															# Module-level handles: client -> database -> collection.
															client = MongoClient(MONGO_URI)
															db = client.get_database(DB_NAME)
															coll = db.get_collection(COLL_NAME)
														
 
															+##################### mongoDB
														
 
															+
														
 
															+#####################text embedding serviceS
														
 
															# Endpoint of the HTTP embedding service and the model requested by default.
															VLLM_SERVER_URL = "http://192.168.100.31:8000/v1/embeddings"
															DEFAULT_MODEL = "/models/Qwen3-Embedding-4B"


															def get_basic_embedding(text: str, model=DEFAULT_MODEL, timeout: float = 5):
															    """Request the embedding of ``text`` from the HTTP embedding service.

															    Args:
															        text: Sent as the ``input`` field of the JSON request body.
															        model: Sent as the ``model`` field of the JSON request body.
															        timeout: Seconds handed to ``requests.post`` as its timeout.
															            Defaults to 5, the value that used to be hard-coded.

															    Returns:
															        The ``embedding`` entry of the first item under ``data`` in the
															        JSON response.

															    Raises:
															        requests.HTTPError: If the service answers with an error status.
															        KeyError, IndexError: If the response JSON lacks ``data[0]["embedding"]``.
															    """
															    payload = {
															        "model": model,
															        "input": text,
															    }
															    response = requests.post(
															        VLLM_SERVER_URL,
															        headers={"Content-Type": "application/json"},
															        json=payload,
															        timeout=timeout,
															    )
															    # Fail loudly on HTTP errors instead of trying to parse an error body.
															    response.raise_for_status()
															    return response.json()["data"][0]["embedding"]
														
 
															+#####################text embedding serviceS
														
 
															+#####################multi vector search
														
 
															+import numpy as np
														
 
															+from collections import defaultdict
														
 
															+from typing import List, Dict, Tuple
														
 
															+
														
 
															+###############multi vector search
														
 
															def search_topk_multi(
															    collection: Collection,
															    query_vecs: List[List[float]],
															    topk: int = 2
															) -> List[Tuple[str, float]]:
															    """Rank objects by their average best similarity over several query vectors.

															    Each query vector is searched separately against the ``embedding`` field,
															    restricted to records matching ``type == "image"``. For every query the
															    highest score per ``mongo_id`` is kept; an object's final score is the
															    sum of those per-query maxima divided by the number of query vectors.

															    Args:
															        collection: Object exposing ``search(...)`` the way a Milvus
															            collection does.
															        query_vecs: Query vectors, searched one at a time.
															        topk: Number of best-scoring objects to return.

															    Returns:
															        Up to ``topk`` ``(mongo_id, average_max_similarity)`` tuples, sorted
															        by descending score. Empty when ``query_vecs`` is empty or nothing
															        matches.
															    """
															    # Identical for every query, so build it once outside the loop.
															    search_params = {
															        "metric_type": "IP",
															        "params": {"nprobe": 10}
															    }
															    total_queries = len(query_vecs)

															    # mongo_id -> sum over queries of that object's best score.
															    score_totals = {}
															    for q_idx, q_vec in enumerate(query_vecs):
															        results = collection.search(
															            data=[q_vec],
															            anns_field="embedding",
															            param=search_params,
															            limit=16384,  # request as many hits as this search call asks for
															            output_fields=["mongo_id", "type", "path"],
															            expr='type == "image"'  # only records whose type is "image"
															        )

															        # Best score per object for this query. A plain dict with an
															        # explicit membership test keeps negative inner-product scores;
															        # comparing against a defaultdict(float) entry clamped them to 0.0.
															        best_for_query = {}
															        for hit in results[0]:
															            obj_id = hit.entity.get("mongo_id")
															            score = hit.score
															            if obj_id not in best_for_query or score > best_for_query[obj_id]:
															                best_for_query[obj_id] = score

															        for obj_id, score in best_for_query.items():
															            score_totals[obj_id] = score_totals.get(obj_id, 0.0) + score

															        print(f"查询向量 {q_idx+1}/{total_queries} 处理完成，覆盖 {len(best_for_query)} 个对象")

															    # A query that did not return an object adds nothing to its total,
															    # i.e. counts as similarity 0 in the average.
															    averaged = [
															        (obj_id, total / total_queries)
															        for obj_id, total in score_totals.items()
															    ]
															    averaged.sort(key=lambda item: item[1], reverse=True)
															    return averaged[:topk]
														
 
															+##################文本数据库 search
														
 
															+###############single vector search
														
 
															def search_topk_single(
															    collection: Collection,
															    query_vec: List[float],
															    topk: int = 2,
															    anns_field: str = "name_embedding",
															) -> List[dict]:
															    """Run one vector search and return the hits for that single query.

															    Args:
															        collection: Object exposing ``search(...)`` the way a Milvus
															            collection does.
															        query_vec: The query vector.
															        topk: Passed as ``limit``, i.e. the number of hits requested.
															        anns_field: Vector field to search. Defaults to
															            ``"name_embedding"`` (the previous hard-coded value); the file
															            mentions ``"desc_embedding"`` as the alternative.

															    Returns:
															        The first element of what ``collection.search`` returns, i.e. the
															        hits belonging to ``query_vec``. Requested output fields are
															        ``mongo_id``, ``pattern_id``, ``pattern_name``, ``pattern_desc``
															        and ``path``. No grouping or re-scoring is done here.
															    """
															    search_params = {
															        "metric_type": "IP",
															        "params": {"nprobe": 10}
															    }
															    results = collection.search(
															        data=[query_vec],
															        anns_field=anns_field,
															        param=search_params,
															        limit=topk,  # only the requested number of hits
															        output_fields=["mongo_id", "pattern_id", "pattern_name", "pattern_desc", "path"]
															    )
															    # One query vector was sent, so only the first result set is relevant.
															    return results[0]
														
 
															+###############single vector search
														
 
															+
														
 
															# Demo switch; the author noted "desc_search" as the other intended mode.
															search_mode = "name_search"

															if search_mode == "name_search":
															    # Example query text. It is not embedded yet: the call to
															    # get_basic_embedding(text=query) is still disabled.
															    query = '#假如食物会说话'

															    # Temporary stand-in: a random 2560-value vector replaces the embedding.
															    import numpy as np
															    q_vec = list(np.random.randn(2560))

															    milvus_client = Collection(name="deconstruct_pattern")
															    milvus_client.load()
															    collection_name = "deconstruct_pattern"

															    # Kept as a module-level name; the search call below does not receive it.
															    search_params = {"metric_type": "IP", "params": {"nprobe": 10}}

															    results = search_topk_single(milvus_client, q_vec, topk=3)
															    print("results is ", results)
															    for i, record in enumerate(results):
															        # Temporary stand-in: the dummy id '10000000' is redirected to a
															        # fixed document id; every other id is used unchanged.
															        mongo_id = (
															            '68f894176a7850acc4851b27'
															            if record['mongo_id'] == '10000000'
															            else record['mongo_id']
															        )
															        # Fetch the source document for this hit (printing it is disabled).
															        docres = coll.find_one({"_id": ObjectId(mongo_id)})