BertEmbedding.py

from typing import List

from transformers import BertModel, BertTokenizer

# Load the pretrained BERT model and its matching tokenizer
model = BertModel.from_pretrained(
    '/Users/sunxy/vscode_project/bitflow/2023_11/title-embedding-base-on-milvus/model/bert-base-chinese/')
tokenizer = BertTokenizer.from_pretrained(
    '/Users/sunxy/vscode_project/bitflow/2023_11/title-embedding-base-on-milvus/model/vocab.txt')


def text_to_vector(text: str) -> List[float]:
    # Tokenize the text into the format the model expects; we embed a single
    # text at a time, so no batch encoding is needed
    inputs = tokenizer(text, return_tensors='pt')
    # Run the input through the BERT model
    outputs = model(**inputs)
    # Take the hidden states of the last layer
    embeddings = outputs.last_hidden_state
    # Keep the first token's ([CLS]) vector of the single input sequence
    # and convert it to a plain Python list
    embeddings = embeddings.detach().numpy().tolist()[0][0]
    return embeddings
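
# A minimal usage sketch (not part of the original file): it assumes the
# local model paths above resolve. For bert-base-chinese the hidden size
# is 768, so the returned list should have 768 floats. The sample string
# is an arbitrary placeholder title.
if __name__ == '__main__':
    vector = text_to_vector('今天天气不错')
    print(len(vector))  # expected: 768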