# coding: utf-8
"""Average pre-trained word vectors over the words of each title.

Usage::

    python <this script> TITLES_FILE

The script reads word vectors from ``word2vec.txt`` in the current working
directory, then reads TITLES_FILE (tab-separated; the first line is skipped,
column 0 is an id, column 1 is a space-separated title).  For every title
containing at least one known word it prints the id and the element-wise
mean of the matched word vectors.
"""
import sys

import numpy as np

try:
    # Only referenced by the (disabled) binary-model loading path; the plain
    # text path below needs nothing from gensim, so its absence is tolerated.
    from gensim import models  # noqa: F401
except ImportError:
    models = None

# Path of the text-format vector file, relative to the working directory.
VECTOR_FILE = 'word2vec.txt'
# Number of components every usable word vector must have.
VECTOR_DIM = 64


def load_word_vectors(path, dim=VECTOR_DIM):
    """Read a space-separated ``word v1 ... v<dim>`` file into a dict.

    Args:
        path: Path of the vector file.
        dim: Required number of float components per word.

    Returns:
        A dict mapping each word to a list of ``dim`` floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a component on an accepted line is not a float.
    """
    word_vectors = {}
    with open(path) as handle:
        for line in handle:
            fields = line.strip().split(" ")
            # Exactly one word plus `dim` components.  Anything else (for
            # example a short header line -- presumably "count dim" in the
            # usual word2vec text layout) is skipped, because a vector of the
            # wrong length would later fail to add to the accumulator.
            if len(fields) != dim + 1:
                continue
            word_vectors[fields[0]] = [float(value) for value in fields[1:]]
    return word_vectors


def average_title_vector(title, word_vectors, dim=VECTOR_DIM):
    """Return the mean vector of the known words in ``title``.

    Args:
        title: Title text; words are separated by single spaces.
        word_vectors: Mapping from word to a sequence of ``dim`` floats.
        dim: Number of components of the vectors.

    Returns:
        A list of ``dim`` Python floats, or ``None`` when no word of the
        title is present in ``word_vectors``.
    """
    total = np.zeros(dim)
    matched = 0
    for word in title.split(" "):
        vector = word_vectors.get(word)
        if vector is None:
            continue
        total = np.add(total, vector)
        matched += 1
    if matched == 0:
        return None
    # tolist() yields plain floats so the printed list does not depend on how
    # the installed NumPy version formats its scalar type.
    return (total / matched).tolist()


def main(argv=None):
    """Print ``id \\t averaged-vector`` for every title in the input file.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name first);
            defaults to ``sys.argv``.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: {} TITLES_FILE".format(argv[0] if argv else "prog"))

    word_vectors = load_word_vectors(VECTOR_FILE)

    with open(argv[1]) as handle:
        next(handle, None)  # the first line of the input is always skipped
        for line in handle:
            # Drop the line terminator first; otherwise the last word of a
            # title in the final column would carry "\n" and never match.
            items = line.rstrip("\r\n").split("\t")
            if len(items) < 2:
                continue
            averaged = average_title_vector(items[1], word_vectors)
            if averaged is None:
                continue
            print(items[0], "\t", averaged)


if __name__ == "__main__":
    main()