import torch
from transformers import BertModel, BertTokenizer
from typing import List

# Paths to the locally stored pretrained Chinese BERT model and its vocabulary.
_MODEL_DIR = '/Users/sunxy/vscode_project/bitflow/2023_11/title-embedding-base-on-milvus/model/bert-base-chinese/'
_VOCAB_PATH = '/Users/sunxy/vscode_project/bitflow/2023_11/title-embedding-base-on-milvus/model/vocab.txt'

# Load the pretrained BERT model and matching tokenizer once, at import time.
model = BertModel.from_pretrained(_MODEL_DIR)
tokenizer = BertTokenizer.from_pretrained(_VOCAB_PATH)


def text_to_vector(text: str) -> List[float]:
    """Return a BERT embedding for *text* as a flat list of floats.

    The vector is the last-hidden-state of the first ([CLS]) token,
    commonly used as a sentence-level representation.

    Args:
        text: A single piece of text to embed.

    Returns:
        A list of ``hidden_size`` floats (768 for bert-base models).
    """
    # Tokenize a single text into model-ready PyTorch tensors
    # (batched encoding is not needed for one input).
    inputs = tokenizer(text, return_tensors='pt')
    # Inference only: disable gradient tracking so no autograd graph
    # is built — same outputs, less memory and compute.
    with torch.no_grad():
        outputs = model(**inputs)
    # last_hidden_state has shape (batch=1, seq_len, hidden_size);
    # take batch item 0, token position 0 (the [CLS] token) and
    # convert just that vector — avoids materializing the whole
    # tensor as a nested Python list first.
    return outputs.last_hidden_state[0, 0].tolist()