# video_semantic_emb_bert.py (1.7 KB)
  1. # encoding: utf-8
  2. import csv
  3. import numpy as np
  4. import time
  5. from bert_serving.client import BertClient
  6. begin_time = time.time()
  7. bc = BertClient()
  8. #bc = BertClient(ip='192.168.204.120', port=5555)
  9. csvFile2 = open("./embedding_semantic_videoTzld1116-alldata.csv",'w',newline='',encoding='utf-8')
  10. writer = csv.writer(csvFile2)
  11. csvRow1 = ['videoId', 'semantic_embedding']
  12. writer.writerow(csvRow1)
  13. csvRow = []
  14. is_title = True
  15. idx = 0
  16. fileName = "./video_words20201115.csv"
  17. with open(fileName, "r") as csvFile1:
  18. reader = csv.reader(csvFile1)
  19. for line in reader:
  20. if is_title:
  21. is_title = False
  22. continue
  23. print("idx is: " + str(idx))
  24. line_list = line
  25. video_id = line_list[0]
  26. print("video is is: " , video_id)
  27. vector_str = line_list[1]
  28. str1 = vector_str
  29. print("vector is: ")
  30. print(str1)
  31. data = []
  32. data.append(vector_str)
  33. vectors = bc.encode(data)
  34. print("vectors.size is: ", vectors.size)
  35. print("vectors.shape is: ", vectors.shape)
  36. vectors_list = vectors.tolist()
  37. # str_list = str1.split(" ")
  38. # str_list = str1.strip('\n').split() # 第一种方法
  39. # str_list = list(filter(None, str1.split(" "))) # 第二种方法
  40. res_row = []
  41. res_str = ""
  42. res_str += video_id + ':'
  43. res_str += str(vectors_list)[2: -2]
  44. res_row.append(res_str)
  45. writer.writerow(res_row)
  46. idx += 1
  47. csvFile2.close()
  48. csvFile1.close()
  49. print("update csv file cost time is: " + str(time.time() - begin_time))