12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455 |
# -*- coding: utf-8 -*-
- import sys
- from gensim import models
- import numpy as np
def _load_word_vectors(path, dim=100):
    """Read a text-format embedding file into a ``{word: vector}`` dict.

    Each usable line is ``word v1 v2 ... v<dim>`` separated by single spaces.
    Lines that do not carry exactly ``dim`` components (for example a
    ``vocab_size dim`` header line, or a truncated row) are skipped, so every
    stored vector can be added to a ``dim``-sized accumulator safely.

    Args:
        path: Path of the embedding text file.
        dim: Number of components every kept vector must have.

    Returns:
        Dict mapping each word to a 1-D float ``numpy`` array of length ``dim``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a component of a correctly sized row is not a number.
    """
    word_vectors = {}
    # NOTE(review): utf-8 is assumed from the file's own coding hint -- confirm
    # against the real embedding file.
    with open(path, encoding="utf-8") as handle:
        for line in handle:
            fields = line.strip().split(" ")
            # One field for the word itself plus ``dim`` components.
            if len(fields) != dim + 1:
                continue
            word_vectors[fields[0]] = np.array(
                [float(value) for value in fields[1:]]
            )
    return word_vectors


def _mean_title_vector(title, word_vectors, dim=100):
    """Average the vectors of the space-separated words of ``title``.

    Args:
        title: Title text whose words are separated by single spaces.
        word_vectors: Mapping produced by ``_load_word_vectors``.
        dim: Length of the vectors in ``word_vectors``.

    Returns:
        A list of ``dim`` Python floats (the component-wise mean over the
        words found in ``word_vectors``), or ``None`` when no word is known.
    """
    total = np.zeros(dim)
    hits = 0
    for word in title.split(" "):
        vector = word_vectors.get(word)
        if vector is not None:
            total = np.add(total, vector)
            hits += 1
    if hits == 0:
        return None
    # tolist() yields plain Python floats, so the printed form does not depend
    # on how the installed NumPy version renders its scalar types.
    return (total / hits).tolist()


def main(argv=None, embeddings_path="word2vec.txt", dim=100):
    """Print the mean word vector of every title in a tab-separated file.

    The input file is ``argv[1]``. Its first line is skipped (treated as a
    header, as the original script did); every later line is expected to be
    ``id<TAB>title[<TAB>...]``. Lines without a tab, and titles with no known
    word, produce no output. Each result is printed as ``id [v1, v2, ...]``.

    Args:
        argv: Argument vector; defaults to ``sys.argv``.
        embeddings_path: Text embedding file to load.
        dim: Dimensionality of the embeddings.

    Returns:
        Process exit status: 0 on success, 2 when the input path is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        program = argv[0] if argv else "script"
        sys.stderr.write("usage: %s TITLES_TSV\n" % program)
        return 2

    word_vectors = _load_word_vectors(embeddings_path, dim)

    with open(argv[1], encoding="utf-8") as titles:
        for line_number, line in enumerate(titles, start=1):
            if line_number == 1:
                continue
            # Drop the line terminator first; otherwise the last word of a
            # trailing title column keeps "\n" and never matches the vocabulary.
            columns = line.rstrip("\r\n").split("\t")
            if len(columns) < 2:
                continue
            mean_vector = _mean_title_vector(columns[1], word_vectors, dim)
            if mean_vector is None:
                continue
            print(columns[0], mean_vector)
    return 0


if __name__ == "__main__":
    sys.exit(main())
-
-
|