123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263 |
- # encoding: utf-8
- import csv
- import numpy as np
- import time
- from bert_serving.client import BertClient
# Encode the text column of a CSV file with a bert-as-service client and write
# one "<videoId>:<embedding values>" cell per input row to a new CSV file.
begin_time = time.time()

# Connects using BertClient's default arguments. For a remote server use e.g.
# BertClient(ip='192.168.204.120', port=5555).
bc = BertClient()

fileName = "./video_words20201115.csv"
outputFileName = "./embedding_semantic_videoTzld1116-alldata.csv"

# A single `with` owns both files so they are flushed and closed even when
# encoding or row parsing raises part-way through. The input is opened first
# so a missing input file does not truncate an existing output file.
# newline="" is what the csv module requires for files handed to reader/writer.
# NOTE(review): the input is still read with the platform default encoding, as
# before; pass encoding=... explicitly once the file's encoding is confirmed.
with open(fileName, "r", newline="") as csvFile1, \
        open(outputFileName, "w", newline="", encoding="utf-8") as csvFile2:
    reader = csv.reader(csvFile1)
    writer = csv.writer(csvFile2)

    # NOTE(review): the header names two columns, but every data row below is
    # written as ONE cell of the form "videoId:values". Kept as-is because
    # downstream consumers may rely on this layout.
    writer.writerow(['videoId', 'semantic_embedding'])

    # Skip the input's title row; the default makes this a no-op on an empty file.
    next(reader, None)

    for idx, line in enumerate(reader):
        print("idx is: " + str(idx))

        # csv.reader yields [] for blank lines; rows without both columns
        # cannot be encoded, so report and skip them instead of crashing.
        if len(line) < 2:
            print("skip malformed row: ", line)
            continue

        video_id, vector_str = line[0], line[1]
        print("video is is: " , video_id)
        print("vector is: ")
        print(vector_str)

        # encode() is given a one-element list; the result exposes .size,
        # .shape and .tolist() (presumably a 2-D numpy array with one row per
        # input string -- TODO confirm against the bert-as-service docs).
        vectors = bc.encode([vector_str])
        print("vectors.size is: ", vectors.size)
        print("vectors.shape is: ", vectors.shape)

        # str() of the nested list looks like "[[v0, v1, ...]]"; the [2:-2]
        # slice drops the two outer bracket pairs, leaving "v0, v1, ...".
        embedding_str = str(vectors.tolist())[2:-2]
        writer.writerow([video_id + ':' + embedding_str])

print("update csv file cost time is: " + str(time.time() - begin_time))
|