1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556 |
# -*- coding: utf-8 -*-
- import sys
- from gensim import models
- import numpy as np
# Dimensionality of the word vectors; the accumulator below must match it.
VECTOR_DIM = 64


def load_word_vectors(path, dim=VECTOR_DIM):
    """Read a space-separated text embedding file into a dict.

    Each usable line has the form ``word v1 v2 ... v<dim>``.

    Args:
        path: Path of the text file to read (decoded as UTF-8).
        dim: Expected number of vector components per word.

    Returns:
        A dict mapping each word to its vector as a list of ``dim`` floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a line of the right length holds a non-numeric value.
    """
    word_dict = {}
    with open(path, encoding="utf-8") as vec_file:
        for line in vec_file:
            items = line.strip().split(" ")
            # Keep only lines with exactly one word plus `dim` components.
            # This drops short lines (presumably the "<vocab> <dim>" header
            # of the word2vec text format) and also malformed lines whose
            # vector could not be added to a `dim`-sized accumulator later.
            if len(items) != dim + 1:
                continue
            word_dict[items[0]] = [float(value) for value in items[1:]]
    return word_dict


def average_title_vector(title, word_dict, dim=VECTOR_DIM):
    """Return the mean vector of the words of *title* found in *word_dict*.

    Args:
        title: Title text; words are separated by single spaces.
        word_dict: Mapping of word to vector, as built by load_word_vectors.
        dim: Number of vector components.

    Returns:
        A list of ``dim`` Python floats, or None when no word of the title
        is present in *word_dict*.
    """
    known_vectors = [word_dict[word] for word in title.split(" ")
                     if word in word_dict]
    if not known_vectors:
        return None
    total = np.zeros(dim)
    # Accumulate one vector at a time, in title order, so the floating-point
    # result does not depend on NumPy's reduction strategy.
    for vector in known_vectors:
        total += vector
    # tolist() yields plain Python floats, so the printed list looks like
    # [0.1, 0.2, ...] regardless of how the installed NumPy formats scalars.
    return (total / len(known_vectors)).tolist()


def iter_title_vectors(lines, word_dict, dim=VECTOR_DIM):
    """Yield ``(vid, mean_vector)`` for every usable row in *lines*.

    Rows are tab-separated with the id in column 0 and the title in column 1.
    The first row is always skipped (presumably a header row). Rows with
    fewer than two columns, or whose title has no known word, are skipped.

    Args:
        lines: Iterable of text lines (e.g. an open file).
        word_dict: Mapping of word to vector.
        dim: Number of vector components.
    """
    for line_no, line in enumerate(lines, start=1):
        if line_no == 1:
            continue
        # Strip the line terminator before splitting; otherwise the last word
        # of a title in the final column keeps its "\n" and is never matched.
        items = line.rstrip("\r\n").split("\t")
        if len(items) < 2:
            continue
        vector = average_title_vector(items[1], word_dict, dim)
        if vector is None:
            continue
        yield items[0], vector


def main(argv):
    """Print the mean word vector of each title in the file ``argv[1]``.

    Word vectors are loaded from ``word2vec.txt`` in the current working
    directory. Each output line is the id, a tab surrounded by spaces
    (print's default separator), and the vector as a list.
    """
    if len(argv) < 2:
        prog = argv[0] if argv else "script"
        sys.exit("usage: {} TITLE_FILE".format(prog))
    word_dict = load_word_vectors('word2vec.txt')
    with open(argv[1], encoding="utf-8") as title_file:
        for vid, vector in iter_title_vectors(title_file, word_dict):
            print(vid, "\t", vector)


if __name__ == "__main__":
    main(sys.argv)
-
-
|