123456789101112131415161718192021222324 |
from transformers import BertModel, BertTokenizer
from typing import List

# Filesystem locations of the local pretrained BERT assets.
# NOTE(review): absolute, machine-specific paths — consider moving these to an
# environment variable or config file before deploying elsewhere.
MODEL_PATH = '/Users/sunxy/vscode_project/bitflow/2023_11/title-embedding-base-on-milvus/model/bert-base-chinese/'
VOCAB_PATH = '/Users/sunxy/vscode_project/bitflow/2023_11/title-embedding-base-on-milvus/model/vocab.txt'

# Load the pretrained BERT model and its matching tokenizer once at import
# time so every call to text_to_vector() reuses the same instances.
model = BertModel.from_pretrained(MODEL_PATH)
tokenizer = BertTokenizer.from_pretrained(VOCAB_PATH)
def text_to_vector(text: str) -> List[float]:
    """Embed a single text as its BERT [CLS] token vector.

    Args:
        text: The input string to embed (one text, not a batch).

    Returns:
        The final-hidden-layer embedding of the first ([CLS]) token,
        as a plain list of floats (length = model hidden size).
    """
    import torch  # local import: only needed here to disable gradient tracking

    # Tokenize the single text into model-ready tensors ('pt' = PyTorch).
    inputs = tokenizer(text, return_tensors='pt')
    # Inference only: no_grad() skips building the autograd graph, saving
    # memory and time versus the bare forward pass (and makes the previous
    # .detach() unnecessary).
    with torch.no_grad():
        outputs = model(**inputs)
    # last_hidden_state is (batch=1, seq_len, hidden); take the first token
    # ([CLS]) of the only sequence as the sentence-level embedding.
    return outputs.last_hidden_state[0][0].tolist()
|